hud-python 0.1.5__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (46) hide show
  1. hud/__init__.py +16 -12
  2. hud/adapters/__init__.py +4 -2
  3. hud/adapters/claude/adapter.py +0 -1
  4. hud/adapters/common/adapter.py +11 -10
  5. hud/adapters/common/types.py +27 -13
  6. hud/adapters/operator/__init__.py +5 -0
  7. hud/adapters/operator/adapter.py +93 -0
  8. hud/agent/__init__.py +7 -0
  9. hud/agent/base.py +109 -0
  10. hud/agent/claude.py +187 -0
  11. hud/agent/operator.py +190 -0
  12. hud/env/__init__.py +11 -0
  13. hud/env/client.py +35 -0
  14. hud/env/docker_client.py +306 -0
  15. hud/env/environment.py +181 -0
  16. hud/env/local_docker_client.py +249 -0
  17. hud/env/remote_client.py +185 -0
  18. hud/env/remote_docker_client.py +221 -0
  19. hud/evaluators/__init__.py +10 -0
  20. hud/evaluators/base.py +31 -0
  21. hud/evaluators/inspect.py +29 -0
  22. hud/evaluators/judge.py +213 -0
  23. hud/evaluators/match.py +163 -0
  24. hud/evaluators/remote.py +78 -0
  25. hud/gym.py +101 -15
  26. hud/job.py +185 -0
  27. hud/server/__init__.py +2 -2
  28. hud/server/requests.py +87 -0
  29. hud/settings.py +13 -2
  30. hud/task.py +133 -0
  31. hud/taskset.py +95 -0
  32. hud/trajectory.py +90 -0
  33. hud/types.py +65 -0
  34. hud/utils/__init__.py +4 -2
  35. hud/utils/common.py +69 -0
  36. hud/utils/config.py +182 -4
  37. hud/utils/telemetry.py +67 -0
  38. hud_python-0.2.0.dist-info/METADATA +188 -0
  39. hud_python-0.2.0.dist-info/RECORD +44 -0
  40. {hud_python-0.1.5.dist-info → hud_python-0.2.0.dist-info}/licenses/LICENSE +1 -1
  41. hud/client.py +0 -200
  42. hud/environment.py +0 -318
  43. hud/run.py +0 -208
  44. hud_python-0.1.5.dist-info/METADATA +0 -125
  45. hud_python-0.1.5.dist-info/RECORD +0 -21
  46. {hud_python-0.1.5.dist-info → hud_python-0.2.0.dist-info}/WHEEL +0 -0
hud/agent/operator.py ADDED
@@ -0,0 +1,190 @@
1
+ import json
2
+ import logging
3
+ import os
4
+ from typing import Any, Literal, cast
5
+
6
+ from openai import OpenAI
7
+ from openai.types.responses import (
8
+ ToolParam,
9
+ ResponseInputParam,
10
+ ResponseInputItemParam,
11
+ ResponseOutputMessage,
12
+ ResponseComputerToolCall
13
+ )
14
+
15
+ from hud.agent.base import Agent
16
+ from hud.adapters.operator import OperatorAdapter
17
+ from hud.env.environment import Observation
18
+ from hud.settings import settings
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
    """
    An agent implementation using OpenAI's Computer Use API.

    The agent sends observations (prompt text and/or screenshots) to the
    OpenAI Responses API with the ``computer_use_preview`` tool enabled and
    returns the raw computer actions proposed by the model. Conversation state
    is carried between calls through ``last_response_id`` (the previous
    response) and ``pending_call_id`` (the computer call awaiting a screenshot).
    """

    def __init__(
        self,
        client: OpenAI | None = None,
        model: str = "computer-use-preview",
        environment: Literal["windows", "mac", "linux", "browser"] = "windows",
        adapter: OperatorAdapter | None = None,
        max_iterations: int = 8,
    ):
        """
        Initialize the OperatorAgent.

        Args:
            client: The OpenAI client for API calls (optional, created
                automatically from ``settings.openai_api_key`` if not provided)
            model: The model to use for computer use
            environment: The environment type (windows, mac, linux, browser)
            adapter: The adapter to use for preprocessing and postprocessing
            max_iterations: Maximum number of iterations for the agent

        Raises:
            ValueError: If no client is given and no OpenAI API key is configured
        """
        if client is None:
            api_key = settings.openai_api_key
            if not api_key:
                raise ValueError("OpenAI API key not found in settings or environment variables. Set OPENAI_API_KEY.")

            # Synchronous client; fetch_response runs its calls in a worker thread.
            client = OpenAI(api_key=api_key)

        super().__init__(client=client, adapter=adapter)

        self.model = model
        self.environment = environment
        self.max_iterations = max_iterations

        # Default display dimensions, overridden by the adapter's when one is set.
        self.width = 1024
        self.height = 768
        if self.adapter:
            self.width = self.adapter.agent_width
            self.height = self.adapter.agent_height

        # Conversation state tracking
        self.last_response_id = None
        self.pending_call_id = None
        self.initial_prompt = None

    async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
        """
        Fetch a response from the model based on the observation.

        The first call sends the prompt text (plus screenshot, if any) as a user
        message. Later calls send the new screenshot as the output of the
        pending computer call, chained to the previous response.

        Args:
            observation: The preprocessed observation

        Returns:
            tuple[list[dict[str, Any]], bool]: A tuple containing the list of raw actions and a
            boolean indicating if the agent believes the task is complete

        Raises:
            ValueError: If the agent has no client
        """
        # Function-scope import: this module does not import asyncio at the top.
        import asyncio

        if not self.client:
            raise ValueError("Client is required")

        # Define the computer use tool with correct type using cast
        computer_tool = cast(ToolParam, {
            "type": "computer_use_preview",
            "display_width": self.width,
            "display_height": self.height,
            "environment": self.environment,
        })

        request: dict[str, Any]
        if self.pending_call_id is None and self.last_response_id is None:
            # This is the first observation, store and send the prompt
            self.initial_prompt = observation.text

            input_content: list[dict[str, Any]] = [
                {"type": "input_text", "text": observation.text or ""}
            ]
            if observation.screenshot:
                input_content.append({
                    "type": "input_image",
                    "image_url": f"data:image/png;base64,{observation.screenshot}",
                })

            request = {
                "input": cast(ResponseInputParam, [{
                    "role": "user",
                    "content": input_content,
                }]),
            }
        else:
            # This is a response to a previous action; it needs a screenshot.
            if not observation.screenshot:
                logger.warning("No screenshot provided for response to action")
                return [], True

            # NOTE(review): if the previous response contained no computer call,
            # pending_call_id may still be None (or stale) here — confirm callers
            # stop once `done` is returned as True.
            request = {
                "previous_response_id": self.last_response_id,
                "input": cast(ResponseInputParam, [
                    cast(ResponseInputItemParam, {
                        "call_id": self.pending_call_id,
                        "type": "computer_call_output",
                        "output": {
                            "type": "input_image",
                            "image_url": f"data:image/png;base64,{observation.screenshot}",
                        },
                    })
                ]),
            }

        # The OpenAI client is synchronous: run the request in a worker thread
        # so this coroutine does not block the event loop while waiting on the API.
        response = await asyncio.to_thread(
            self.client.responses.create,
            model=self.model,
            tools=[computer_tool],
            truncation="auto",
            **request,
        )

        # Store the response ID for the next call
        self.last_response_id = response.id

        # Collect every computer_call item from the response output
        computer_calls = [
            item for item in response.output
            if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
        ]

        actions: list[dict[str, Any]] = []
        # We are done unless the model asked for at least one computer action.
        done = not computer_calls

        if computer_calls:
            for computer_call in computer_calls:
                # Only the most recent call id is kept for the follow-up request.
                self.pending_call_id = computer_call.call_id
                action = computer_call.action
                actions.append(action.model_dump())
                logger.info("Computer call action: %s", action)
        else:
            # No computer calls: log any messages to help tell completion from error.
            logger.info("No computer call found in the response. Either complete or error.")
            for item in response.output:
                if isinstance(item, ResponseOutputMessage) and item.type == "message":
                    logger.info("Message: %s", item.content)

        return actions, done
hud/env/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ from __future__ import annotations
2
+
3
+ from . import docker_client, environment, local_docker_client, remote_client, remote_docker_client
4
+
5
+ __all__ = [
6
+ "docker_client",
7
+ "environment",
8
+ "local_docker_client",
9
+ "remote_client",
10
+ "remote_docker_client",
11
+ ]
hud/env/client.py ADDED
@@ -0,0 +1,35 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import TYPE_CHECKING, Any
5
+
6
+ from pydantic import BaseModel
7
+
8
+ if TYPE_CHECKING:
9
+ from hud.types import EnvironmentStatus
10
+ from hud.utils.config import HudStyleConfig
11
+
12
+
13
class Client(BaseModel, ABC):
    """Abstract interface that every environment client implements."""

    @abstractmethod
    async def invoke(self, config: HudStyleConfig) -> Any:
        """Run the given config against the environment and return its result."""

    @abstractmethod
    async def get_status(self) -> EnvironmentStatus:
        """Report the environment's current status."""

    @abstractmethod
    async def close(self) -> None:
        """
        Shut the environment down and release its resources.

        Call this once the environment is no longer needed.
        """
hud/env/docker_client.py ADDED
@@ -0,0 +1,306 @@
1
+ from __future__ import annotations
2
+
3
+ import abc
4
+ import json
5
+ import logging
6
+ import os
7
+ import uuid
8
+ from pathlib import Path
9
+ from typing import TYPE_CHECKING, Any
10
+
11
+ import toml
12
+
13
+ from hud.env.client import Client
14
+ from hud.types import EnvironmentStatus
15
+ from hud.utils.common import directory_to_tar_bytes
16
+
17
+ if TYPE_CHECKING:
18
+ from hud.utils import ExecuteResult
19
+ from hud.utils.config import HudStyleConfig
20
+
21
+ logger = logging.getLogger("hud.env.docker_client")
22
+
23
# Human-readable phrase for each environment status, keyed by the enum member's `.value`.
STATUS_MESSAGES = {
    status.value: phrase
    for status, phrase in (
        (EnvironmentStatus.RUNNING, "is running"),
        (EnvironmentStatus.ERROR, "had an error initializing"),
        (EnvironmentStatus.COMPLETED, "completed"),
    )
}
28
+
29
+
30
class InvokeError(Exception):
    """Raised when invoking a function in the environment fails."""
34
+
35
+
36
def invoke_template(config: HudStyleConfig, package_name: str, divider: str) -> str:
    """
    Return a python script that runs the given config.

    The script imports the target function from ``package_name`` (plus any
    dotted module prefix in ``config.function``), calls it with
    ``config.args`` unpacked positionally, then prints ``divider`` on its own
    line followed by the JSON-encoded result.

    Args:
        config: Provides ``function`` (dotted path relative to the package)
            and ``args`` (JSON-serializable positional arguments)
        package_name: The top-level package the function lives in
        divider: Marker line separating the function's own output from the result

    Returns:
        str: The source code of the script
    """
    func_parts = config.function.split(".")
    module_str = ".".join([package_name, *func_parts[:-1]])
    func_str = func_parts[-1]

    # `json.dumps` is applied twice to the args: once to serialize them and
    # once more to turn that JSON text into a safely escaped string literal.
    args_literal = json.dumps(json.dumps(config.args))
    # Escape the divider the same way so quotes or backslashes in it cannot
    # break the generated script (identical output for plain dividers).
    divider_literal = json.dumps(divider)

    return f"""import json
from {module_str} import {func_str}
args = json.loads({args_literal})
result = {func_str}(*args)
result_str = json.dumps(result)
print({divider_literal})
print(result_str)
"""
53
+
54
class DockerClient(Client):
    """
    Base class for docker-backed environment clients.

    Handles updating the environment when local files change: the source
    directory is uploaded to ``/root/controller`` in the environment and
    reinstalled with pip whenever its ``pyproject.toml`` changes.
    """

    # Content of pyproject.toml at the last successful install
    _last_pyproject_toml_str: str | None = None
    _last_update_time: int = 0
    # Modification times of the source files at the last successful update
    _last_file_mtimes: dict[str, float] = {}  # noqa: RUF012
    _source_path: Path | None = None
    _package_name: str | None = None

    @property
    def source_path(self) -> Path | None:
        """Get the source path."""
        return self._source_path

    @property
    def package_name(self) -> str:
        """
        Get the package name.

        Raises:
            ValueError: If the package name has not been set
        """
        if not self._package_name:
            raise ValueError("Package name not set")
        return self._package_name

    def set_source_path(self, source_path: Path) -> None:
        """
        Set the source path for this environment controller.
        Can only be set once, and cannot be set if source_path is already set.

        Args:
            source_path: Path to the source code to use in the environment

        Raises:
            ValueError: If source_path has already been set, or if
                pyproject.toml does not declare a project name
            FileNotFoundError: If source_path or its pyproject.toml does not exist
            NotADirectoryError: If source_path is not a directory
        """
        if self._source_path:
            raise ValueError("Source path has already been set")

        # Validate source path
        if not source_path.exists():
            raise FileNotFoundError(f"Source path {source_path} does not exist")
        if not source_path.is_dir():
            raise NotADirectoryError(f"Source path {source_path} is not a directory")

        # Parse pyproject.toml to get package name
        pyproject_path = source_path / "pyproject.toml"
        if not pyproject_path.exists():
            raise FileNotFoundError(f"pyproject.toml not found in {source_path}")

        pyproject_data = toml.load(pyproject_path)
        package_name = pyproject_data.get("project", {}).get("name")
        if not package_name:
            raise ValueError("Could not find package name in pyproject.toml")

        # Only mutate state once everything has been validated.
        self._package_name = package_name
        self._source_path = source_path

    @classmethod
    @abc.abstractmethod
    async def create(cls, dockerfile: str) -> DockerClient:
        """
        Creates an environment client from a dockerfile.

        Args:
            dockerfile: The dockerfile content to build the environment

        Returns:
            DockerClient: An instance of the environment client
        """

    @abc.abstractmethod
    async def get_status(self) -> EnvironmentStatus:
        """
        Get the current status of the environment.

        Returns:
            EnvironmentStatus: A status enum indicating the current state of the environment
        """

    def _get_all_file_mtimes(self) -> dict[str, float]:
        """
        Get modification times for all files in the source path.

        Returns:
            Dict[str, float]: Dictionary mapping file paths to modification times
            (empty when no source path is set)
        """
        if not self._source_path:
            return {}

        file_mtimes: dict[str, float] = {}
        for root, _, files in os.walk(self._source_path):
            for file in files:
                file_path = Path(root) / file
                try:
                    file_mtimes[str(file_path)] = file_path.stat().st_mtime
                except (FileNotFoundError, PermissionError):
                    # Skip files that can't be accessed
                    continue
        return file_mtimes

    async def needs_update(self) -> bool:
        """
        Check if the environment needs an update by:
        1. Checking if any file is new or has been modified since the last update

        Deleted files alone do not trigger an update.

        Returns:
            bool: True if the environment needs an update, False otherwise.
        """
        # If no source path, no update needed
        if not self.source_path:
            return False

        # If we don't have previous modification times, we need an update
        if not self._last_file_mtimes:
            return True

        # Check for new or modified files
        previous = self._last_file_mtimes
        return any(
            file_path not in previous or mtime > previous[file_path]
            for file_path, mtime in self._get_all_file_mtimes().items()
        )

    async def update(self) -> None:
        """
        Base update method for environment controllers.
        For controllers with no source path, this is a no-op.

        Uploads the source directory to ``/root/controller`` and, when
        pyproject.toml changed since the last install, reinstalls the package
        with ``pip install -e .``.

        Raises:
            FileNotFoundError: If pyproject.toml is missing from the source path
            ValueError: If pyproject.toml does not declare a project name
        """
        # If no source path, nothing to update
        if not self._source_path:
            return

        logger.info("Updating environment")

        # Snapshot the modification times before uploading, so edits made
        # during the upload are still seen as newer next time. The snapshot is
        # committed only after the update succeeds; otherwise a failed update
        # would make needs_update() report False and leave stale code in place.
        uploaded_mtimes = self._get_all_file_mtimes()

        # Create tar archive of the source code and send it to the container
        tar_bytes = directory_to_tar_bytes(self._source_path)
        await self.execute(["mkdir", "-p", "/root/controller"], timeout=5)
        await self.put_archive("/root/controller", tar_bytes)

        # Check if pyproject.toml exists and parse it
        pyproject_path = self._source_path / "pyproject.toml"
        if not pyproject_path.exists():
            raise FileNotFoundError(f"pyproject.toml not found in {self._source_path}")

        # Reinstall only when pyproject.toml differs from the last installed one
        current_pyproject_content = pyproject_path.read_text()
        if self._last_pyproject_toml_str != current_pyproject_content:
            pyproject_data = toml.loads(current_pyproject_content)
            package_name = pyproject_data.get("project", {}).get("name")
            if not package_name:
                raise ValueError("Could not find package name in pyproject.toml")
            # Keep the previous name intact until the new one is known to be valid.
            self._package_name = package_name

            logger.info("Installing %s in /root/controller", self._package_name)
            install_result = await self.execute(
                ["bash", "-c", "cd /root/controller && pip install -e ."],
                timeout=60,
            )
            if install_result["stdout"]:
                logger.info("STDOUT:\n%s", install_result["stdout"])
            if install_result["stderr"]:
                logger.warning("STDERR:\n%s", install_result["stderr"])

            # Save current pyproject.toml content
            self._last_pyproject_toml_str = current_pyproject_content

        self._last_file_mtimes = uploaded_mtimes

    @abc.abstractmethod
    async def execute(
        self,
        command: list[str],
        *,
        timeout: int | None = None,
    ) -> ExecuteResult:
        """
        Execute a command in the environment. May not be supported by all environments.

        Args:
            command: The command to execute
            timeout: The timeout for the command

        Returns:
            ExecuteResult: The result of the command
        """

    async def invoke(self, config: HudStyleConfig) -> tuple[Any, bytes, bytes]:
        """
        Invoke a function in the environment. Supported by all environments.

        The environment is updated first if local source files changed.

        Args:
            config: The configuration to invoke

        Returns:
            tuple[Any, bytes, bytes]: The result of the invocation, stdout, and stderr

        Raises:
            InvokeError: If the script's output does not contain the result
                divider; carries the captured stdout and stderr
        """
        if await self.needs_update():
            logger.info("Environment needs update, updating")
            await self.update()

        # generate a random uuid as a divider
        divider = str(uuid.uuid4())

        template = invoke_template(config, self.package_name, divider)
        logger.debug("Invoking template: %s", template)

        exec_result = await self.execute(["python3", "-c", template])

        # we take the whole stderr as the stderr, and the stdout is the output pre-divider
        stderr = exec_result["stderr"]
        stdout_parts = exec_result["stdout"].split(divider.encode())
        stdout = stdout_parts[0]

        # No divider means the script never reached the point of printing its result.
        if len(stdout_parts) <= 1:
            raise InvokeError(stdout, stderr)

        # parse the json part of the stdout
        invoke_result = json.loads(stdout_parts[1])
        return invoke_result, stdout, stderr

    @abc.abstractmethod
    async def get_archive(self, path: str) -> bytes:
        """
        Get an archive of a path from the environment.
        May not be supported by all environments. (notably browser environments)

        Args:
            path: The path to get the archive of

        Returns:
            bytes: The archive of the path
        """

    @abc.abstractmethod
    async def put_archive(self, path: str, data: bytes) -> bool:
        """
        Put an archive of data at a path in the environment.
        May not be supported by all environments. (notably browser environments)

        Args:
            path: The path to put the archive at
            data: The data to put in the archive

        Returns:
            bool: Defined by implementations; presumably whether the upload
            succeeded — TODO confirm
        """
306
+