hud-python 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic; review the changes below before upgrading.

Files changed (59)
  1. hud/__init__.py +5 -3
  2. hud/adapters/__init__.py +2 -1
  3. hud/adapters/claude/adapter.py +13 -17
  4. hud/adapters/common/adapter.py +3 -3
  5. hud/adapters/common/tests/__init__.py +0 -0
  6. hud/adapters/common/tests/test_adapter.py +277 -0
  7. hud/adapters/common/types.py +3 -6
  8. hud/adapters/operator/adapter.py +22 -29
  9. hud/agent/__init__.py +9 -1
  10. hud/agent/base.py +28 -28
  11. hud/agent/claude.py +69 -60
  12. hud/agent/langchain.py +204 -0
  13. hud/agent/operator.py +75 -67
  14. hud/env/__init__.py +5 -5
  15. hud/env/client.py +2 -2
  16. hud/env/docker_client.py +37 -39
  17. hud/env/environment.py +91 -66
  18. hud/env/local_docker_client.py +5 -7
  19. hud/env/remote_client.py +40 -29
  20. hud/env/remote_docker_client.py +13 -3
  21. hud/evaluators/__init__.py +2 -3
  22. hud/evaluators/base.py +4 -3
  23. hud/evaluators/inspect.py +3 -8
  24. hud/evaluators/judge.py +34 -58
  25. hud/evaluators/match.py +42 -49
  26. hud/evaluators/remote.py +13 -26
  27. hud/evaluators/tests/__init__.py +0 -0
  28. hud/evaluators/tests/test_inspect.py +12 -0
  29. hud/evaluators/tests/test_judge.py +231 -0
  30. hud/evaluators/tests/test_match.py +115 -0
  31. hud/evaluators/tests/test_remote.py +98 -0
  32. hud/exceptions.py +167 -0
  33. hud/gym.py +12 -10
  34. hud/job.py +525 -47
  35. hud/server/__init__.py +2 -2
  36. hud/server/requests.py +148 -186
  37. hud/server/tests/__init__.py +0 -0
  38. hud/server/tests/test_requests.py +275 -0
  39. hud/settings.py +3 -2
  40. hud/task.py +12 -22
  41. hud/taskset.py +44 -11
  42. hud/trajectory.py +6 -9
  43. hud/types.py +14 -9
  44. hud/utils/__init__.py +2 -2
  45. hud/utils/common.py +37 -13
  46. hud/utils/config.py +44 -29
  47. hud/utils/progress.py +149 -0
  48. hud/utils/telemetry.py +10 -11
  49. hud/utils/tests/__init__.py +0 -0
  50. hud/utils/tests/test_common.py +52 -0
  51. hud/utils/tests/test_config.py +129 -0
  52. hud/utils/tests/test_progress.py +225 -0
  53. hud/utils/tests/test_telemetry.py +37 -0
  54. hud/utils/tests/test_version.py +8 -0
  55. {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/METADATA +44 -21
  56. hud_python-0.2.3.dist-info/RECORD +62 -0
  57. hud_python-0.2.1.dist-info/RECORD +0 -44
  58. {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/WHEEL +0 -0
  59. {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/licenses/LICENSE +0 -0
hud/agent/operator.py CHANGED
@@ -10,36 +10,37 @@ from openai.types.responses import (
     ResponseInputItemParam,
     ResponseOutputMessage,
     ResponseComputerToolCall,
-    ResponseOutputText
+    ResponseOutputText,
 )
 
 from hud.adapters import Adapter
 from hud.agent.base import Agent
 from hud.adapters.operator import OperatorAdapter
-from hud.env.environment import Observation
+from hud.utils.common import Observation
 from hud.settings import settings
 
 logger = logging.getLogger(__name__)
 
+
 class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
     """
     An agent implementation using OpenAI's Computer Use API.
-
+
     This agent interacts with HUD environments using OpenAI's Computer Use API
     through the OperatorAdapter which converts actions to the format expected by HUD.
     """
-
+
     def __init__(
-        self,
+        self,
         client: OpenAI | None = None,
         model: str = "computer-use-preview",
         environment: Literal["windows", "mac", "linux", "browser"] = "windows",
         adapter: Adapter | None = None,
-        max_iterations: int = 8
+        max_iterations: int = 8,
     ):
         """
         Initialize the OperatorAgent.
-
+
         Args:
             client: The OpenAI client for API calls (optional, created automatically if not provided)
             model: The model to use for computer use
@@ -52,28 +53,30 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
             # Get API key from settings
             api_key = settings.openai_api_key
             if not api_key:
-                raise ValueError("OpenAI API key not found in settings or environment variables. Set OPENAI_API_KEY.")
-
+                raise ValueError(
+                    "OpenAI API key not found in settings or environment variables. Set OPENAI_API_KEY."
+                )
+
             # Create synchronous client
             client = OpenAI(api_key=api_key)
 
         adapter = adapter or OperatorAdapter()
-
+
         super().__init__(client=client, adapter=adapter)
-
+
         self.model = model
         self.environment = environment
         self.max_iterations = max_iterations
-
+
         # Default dimensions
         self.width = 1024
         self.height = 768
-
+
         # Update dimensions if adapter is provided
        if self.adapter:
             self.width = self.adapter.agent_width
             self.height = self.adapter.agent_height
-
+
         # Message history and state tracking
         self.last_response_id = None
         self.pending_call_id = None
@@ -82,86 +85,91 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
     async def fetch_response(self, observation: Observation) -> tuple[list[dict[str, Any]], bool]:
         """
         Fetch a response from the model based on the observation.
-
+
         Args:
             observation: The preprocessed observation
-
+
         Returns:
             tuple[list[dict[str, Any]], bool]: A tuple containing the list of raw actions and a
             boolean indicating if the agent believes the task is complete
         """
         if not self.client:
             raise ValueError("Client is required")
-
+
         # Define the computer use tool with correct type using cast
-        computer_tool = cast(ToolParam, {
-            "type": "computer_use_preview",
-            "display_width": self.width,
-            "display_height": self.height,
-            "environment": self.environment
-        })
-
+        computer_tool = cast(
+            ToolParam,
+            {
+                "type": "computer_use_preview",
+                "display_width": self.width,
+                "display_height": self.height,
+                "environment": self.environment,
+            },
+        )
+
         # Process the observation based on whether it's the first one or a response to an action
         if self.pending_call_id is None and self.last_response_id is None:
             # This is the first observation, store and send the prompt
             self.initial_prompt = observation.text
-
+
             # Create the initial request following the required structure
             input_content: list[dict[str, Any]] = [
                 {"type": "input_text", "text": observation.text or ""}
             ]
-
+
             # Add screenshot if present
             if observation.screenshot:
-                input_content.append({
-                    "type": "input_image",
-                    "image_url": f"data:image/png;base64,{observation.screenshot}"
-                })
-
+                input_content.append(
+                    {
+                        "type": "input_image",
+                        "image_url": f"data:image/png;base64,{observation.screenshot}",
+                    }
+                )
+
             # Structure the input correctly for the API using cast
-            input_param = cast(ResponseInputParam, [{
-                "role": "user",
-                "content": input_content
-            }])
-
+            input_param = cast(ResponseInputParam, [{"role": "user", "content": input_content}])
+
             # Call OpenAI API for the initial prompt (synchronous call)
             response = self.client.responses.create(
-                model=self.model,
-                tools=[computer_tool],
-                input=input_param,
-                truncation="auto"
+                model=self.model, tools=[computer_tool], input=input_param, truncation="auto"
             )
-
+
         else:
             # This is a response to a previous action
             if not observation.screenshot:
                 logger.warning("No screenshot provided for response to action")
                 return [], True
-
+
             # Create a response to the previous action with the new screenshot
-            input_param_followup = cast(ResponseInputParam, [
-                cast(ResponseInputItemParam, {
-                    "call_id": self.pending_call_id,
-                    "type": "computer_call_output",
-                    "output": {
-                        "type": "input_image",
-                        "image_url": f"data:image/png;base64,{observation.screenshot}"
-                    }
-                })
-            ])
-
+            input_param_followup = cast(
+                ResponseInputParam,
+                [
+                    cast(
+                        ResponseInputItemParam,
+                        {
+                            "call_id": self.pending_call_id,
+                            "type": "computer_call_output",
+                            "output": {
+                                "type": "input_image",
+                                "image_url": f"data:image/png;base64,{observation.screenshot}",
+                            },
+                        },
+                    )
+                ],
+            )
+
             # Call OpenAI API for follow-up (synchronous call)
             response = self.client.responses.create(
                 model=self.model,
                 previous_response_id=self.last_response_id,
                 tools=[computer_tool],
                 input=input_param_followup,
-                truncation="auto"
+                truncation="auto",
             )
-
+
         # Store the response ID for the next call
         self.last_response_id = response.id
-
+
         # Process the response to extract actions or final text
         actions = []
         done = True  # Assume done unless a computer call is found
@@ -169,17 +177,18 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
 
         # Check for computer calls first
         computer_calls = [
-            item for item in response.output
+            item
+            for item in response.output
             if isinstance(item, ResponseComputerToolCall) and item.type == "computer_call"
         ]
-
+
         if computer_calls:
             # If computer calls exist, process them and set done=False
             done = False
             for computer_call in computer_calls:
                 self.pending_call_id = computer_call.call_id
                 action = computer_call.action
-                actions.append(action.model_dump()) # Convert Pydantic model to dict
+                actions.append(action.model_dump())  # Convert Pydantic model to dict
                 logger.info(f"Computer call action: {action}")
         else:
             # No computer calls, check for a final text message
@@ -188,21 +197,20 @@ class OperatorAgent(Agent[OpenAI, dict[str, Any]]):
             for item in response.output:
                 if isinstance(item, ResponseOutputMessage) and item.type == "message":
                     # Extract text from content blocks within the message
-                    full_text = "".join([c.text for c in item.content if isinstance(c, ResponseOutputText)])
+                    full_text = "".join(
+                        [c.text for c in item.content if isinstance(c, ResponseOutputText)]
+                    )
                     if full_text:
                         final_text_response = full_text
                         logger.info(f"Final text message: {final_text_response}")
-                        break # Stop after finding the first text message
-
+                        break  # Stop after finding the first text message
+
             # If we found final text, package it as a 'response' action
             if final_text_response:
-                actions = [{
-                    "type": "response",
-                    "text": final_text_response
-                }]
+                actions = [{"type": "response", "text": final_text_response}]
                 # Keep done = True
             else:
                 logger.info("No computer calls and no final text message found.")
                 # Keep done = True, actions remains empty
 
-        return actions, done
+        return actions, done
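
For orientation, here is a minimal usage sketch matching the 0.2.3 signatures shown above (constructor keywords and the new Observation import path come from the diff; the task text and the keyword-style Observation construction are illustrative assumptions, and OPENAI_API_KEY is assumed to be set):

# Sketch only: mirrors the OperatorAgent signature visible in this diff; not official usage docs.
import asyncio

from hud.agent.operator import OperatorAgent
from hud.utils.common import Observation  # import location changed from hud.env.environment in 0.2.3


async def main() -> None:
    agent = OperatorAgent(
        model="computer-use-preview",
        environment="browser",
        max_iterations=8,
    )
    # The first call sends the task prompt; later calls feed screenshots back to pending tool calls.
    # Constructing Observation with a text keyword is an assumption for illustration.
    actions, done = await agent.fetch_response(Observation(text="Open the settings page"))
    print(done, actions)


asyncio.run(main())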
hud/env/__init__.py CHANGED
@@ -3,9 +3,9 @@ from __future__ import annotations
 from . import docker_client, environment, local_docker_client, remote_client, remote_docker_client
 
 __all__ = [
-    "docker_client",
-    "environment",
-    "local_docker_client",
-    "remote_client",
-    "remote_docker_client",
+    "docker_client",
+    "environment",
+    "local_docker_client",
+    "remote_client",
+    "remote_docker_client",
 ]
hud/env/client.py CHANGED
@@ -7,7 +7,7 @@ from pydantic import BaseModel
 
 if TYPE_CHECKING:
     from hud.types import EnvironmentStatus
-    from hud.utils.config import HudStyleConfig
+    from hud.utils.config import FunctionConfig
 
 
 class Client(BaseModel, ABC):
@@ -16,7 +16,7 @@ class Client(BaseModel, ABC):
     """
 
     @abstractmethod
-    async def invoke(self, config: HudStyleConfig) -> Any:
+    async def invoke(self, config: FunctionConfig) -> Any:
         """
         Invoke the environment with the given config.
         """
hud/env/docker_client.py CHANGED
@@ -16,7 +16,7 @@ from hud.utils.common import directory_to_tar_bytes
 
 if TYPE_CHECKING:
     from hud.utils import ExecuteResult
-    from hud.utils.config import HudStyleConfig
+    from hud.utils.config import FunctionConfig
 
 logger = logging.getLogger("hud.env.docker_client")
 
@@ -33,7 +33,7 @@ class InvokeError(Exception):
     """
 
 
-def invoke_template(config: HudStyleConfig, package_name: str, divider: str) -> str:
+def invoke_template(config: FunctionConfig, package_name: str, divider: str) -> str:
     """
     Return a python script to run the given config.
     """
@@ -51,16 +51,17 @@ print("{divider}")
 print(result_str)
 """
 
+
 class DockerClient(Client):
     """
     Base class for environment clients.
-
+
     Handles updating the environment when local files change.
     """
-
+
     _last_pyproject_toml_str: str | None = None
     _last_update_time: int = 0
-    _last_file_mtimes: dict[str, float] = {}  # noqa: RUF012
+    _last_file_mtimes: dict[str, float] = {}  # noqa: RUF012 - Not recognized as Pydantic model
     _source_path: Path | None = None
     _package_name: str | None = None
 
@@ -68,47 +69,46 @@ class DockerClient(Client):
     def source_path(self) -> Path | None:
         """Get the source path."""
         return self._source_path
-
+
     @property
     def package_name(self) -> str:
         """Get the package name."""
         if not self._package_name:
             raise ValueError("Package name not set")
         return self._package_name
-
 
     def set_source_path(self, source_path: Path) -> None:
         """
         Set the source path for this environment controller.
         Can only be set once, and cannot be set if source_path is already set.
-
+
         Args:
             source_path: Path to the source code to use in the environment
-
+
         Raises:
             ValueError: If source_path has already been set
         """
         if self._source_path:
             raise ValueError("Source path has already been set")
-
+
         # Validate source path
         if not source_path.exists():
             raise FileNotFoundError(f"Source path {source_path} does not exist")
         if not source_path.is_dir():
             raise NotADirectoryError(f"Source path {source_path} is not a directory")
-
+
         # Parse pyproject.toml to get package name
         pyproject_path = source_path / "pyproject.toml"
         if not pyproject_path.exists():
             raise FileNotFoundError(f"pyproject.toml not found in {source_path}")
-
+
         pyproject_data = toml.load(pyproject_path)
         self._package_name = pyproject_data.get("project", {}).get("name")
         if not self._package_name:
             raise ValueError("Could not find package name in pyproject.toml")
-
+
         self._source_path = source_path
-
+
     @classmethod
     @abc.abstractmethod
     async def create(cls, dockerfile: str) -> DockerClient:
@@ -121,26 +121,26 @@ class DockerClient(Client):
         Returns:
             EnvClient: An instance of the environment client
         """
-
+
     @abc.abstractmethod
     async def get_status(self) -> EnvironmentStatus:
         """
         Get the current status of the environment.
-
+
         Returns:
             EnvironmentStatus: A status enum indicating the current state of the environment
         """
-
+
     def _get_all_file_mtimes(self) -> dict[str, float]:
         """
         Get modification times for all files in the source path.
-
+
         Returns:
             Dict[str, float]: Dictionary mapping file paths to modification times
         """
         if not self._source_path:
             return {}
-
+
         file_mtimes = {}
         for root, _, files in os.walk(self._source_path):
             for file in files:
@@ -151,12 +151,12 @@ class DockerClient(Client):
                     # Skip files that can't be accessed
                    continue
         return file_mtimes
-
+
     async def needs_update(self) -> bool:
         """
         Check if the environment needs an update by:
         1. Checking if any file has been modified since the last update
-
+
         Returns:
             bool: True if the environment needs an update, False otherwise.
         """
@@ -166,18 +166,18 @@ class DockerClient(Client):
 
         # Check if any file has been modified since the last update
         current_mtimes = self._get_all_file_mtimes()
-
+
         # If we don't have previous modification times, we need an update
         if not self._last_file_mtimes:
             return True
-
+
         # Check for new or modified files
         for file_path, mtime in current_mtimes.items():
             if file_path not in self._last_file_mtimes or mtime > self._last_file_mtimes[file_path]:
                 return True
-
+
         return False
-
+
     async def update(self) -> None:
         """
         Base update method for environment controllers.
@@ -186,22 +186,22 @@ class DockerClient(Client):
         # If no source path, nothing to update
         if not self._source_path:
             return
-
+
         logger.info("Updating environment")
 
         # Save current file modification times
         self._last_file_mtimes = self._get_all_file_mtimes()
-
+
         # Create tar archive of the source code and send it to the container
         tar_bytes = directory_to_tar_bytes(self._source_path)
         await self.execute(["mkdir", "-p", "/root/controller"], timeout=5)
         await self.put_archive("/root/controller", tar_bytes)
-
+
         # Check if pyproject.toml exists and parse it
         pyproject_path = self._source_path / "pyproject.toml"
         if not pyproject_path.exists():
             raise FileNotFoundError(f"pyproject.toml not found in {self._source_path}")
-
+
         # Read and parse the current content of pyproject.toml
         current_pyproject_content = pyproject_path.read_text()
         if (
@@ -224,8 +224,7 @@ class DockerClient(Client):
                 logger.warning("STDERR:\n%s", result["stderr"])
             # Save current pyproject.toml content
             self._last_pyproject_toml_str = current_pyproject_content
-
-
+
     @abc.abstractmethod
     async def execute(
         self,
@@ -235,20 +234,20 @@ class DockerClient(Client):
     ) -> ExecuteResult:
         """
         Execute a command in the environment. May not be supported by all environments.
-
+
         Args:
             command: The command to execute
             workdir: The working directory to execute the command in
             timeout: The timeout for the command
-
+
         Returns:
             ExecuteResult: The result of the command
         """
-
-    async def invoke(self, config: HudStyleConfig) -> tuple[Any, bytes, bytes]:
+
+    async def invoke(self, config: FunctionConfig) -> tuple[Any, bytes, bytes]:
         """
         Invoke a function in the environment. Supported by all environments.
-
+
         Args:
             config: The configuration to invoke
 
@@ -289,11 +288,11 @@ class DockerClient(Client):
         May not be supported by all environments. (notably browser environments)
         Args:
             path: The path to get the archive of
-
+
         Returns:
             bytes: The archive of the path
         """
-
+
     @abc.abstractmethod
     async def put_archive(self, path: str, data: bytes) -> bool:
         """
@@ -303,4 +302,3 @@ class DockerClient(Client):
             path: The path to put the archive at
             data: The data to put in the archive
         """
-
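
The _get_all_file_mtimes/needs_update pair shown above implements a simple modification-time snapshot comparison. A standalone sketch of the same idea follows; the function names here are illustrative and not part of the hud API:

# Standalone illustration of mtime-based change detection, independent of hud.
import os
from pathlib import Path


def snapshot_mtimes(source_path: Path) -> dict[str, float]:
    """Map each file under source_path to its last-modification time."""
    mtimes: dict[str, float] = {}
    for root, _, files in os.walk(source_path):
        for name in files:
            full_path = os.path.join(root, name)
            try:
                mtimes[full_path] = os.path.getmtime(full_path)
            except OSError:
                continue  # skip files that vanish or cannot be read
    return mtimes


def needs_update(previous: dict[str, float], current: dict[str, float]) -> bool:
    """True when there is no previous snapshot, or when any file is new or newer."""
    if not previous:
        return True
    return any(path not in previous or mtime > previous[path] for path, mtime in current.items())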