hud-python 0.2.2 (py3-none-any.whl) → 0.2.4 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of hud-python might be problematic.

Files changed (58)
  1. hud/__init__.py +4 -3
  2. hud/adapters/claude/adapter.py +5 -14
  3. hud/adapters/common/adapter.py +3 -3
  4. hud/adapters/common/tests/__init__.py +0 -0
  5. hud/adapters/common/tests/test_adapter.py +277 -0
  6. hud/adapters/common/types.py +3 -3
  7. hud/adapters/operator/adapter.py +16 -23
  8. hud/agent/__init__.py +8 -1
  9. hud/agent/base.py +28 -28
  10. hud/agent/claude.py +69 -60
  11. hud/agent/langchain.py +32 -26
  12. hud/agent/operator.py +75 -67
  13. hud/env/__init__.py +5 -5
  14. hud/env/client.py +2 -2
  15. hud/env/docker_client.py +37 -39
  16. hud/env/environment.py +91 -66
  17. hud/env/local_docker_client.py +5 -7
  18. hud/env/remote_client.py +39 -32
  19. hud/env/remote_docker_client.py +13 -3
  20. hud/evaluators/__init__.py +2 -3
  21. hud/evaluators/base.py +4 -3
  22. hud/evaluators/inspect.py +3 -8
  23. hud/evaluators/judge.py +34 -58
  24. hud/evaluators/match.py +42 -49
  25. hud/evaluators/remote.py +13 -26
  26. hud/evaluators/tests/__init__.py +0 -0
  27. hud/evaluators/tests/test_inspect.py +12 -0
  28. hud/evaluators/tests/test_judge.py +231 -0
  29. hud/evaluators/tests/test_match.py +115 -0
  30. hud/evaluators/tests/test_remote.py +98 -0
  31. hud/exceptions.py +167 -0
  32. hud/gym.py +9 -7
  33. hud/job.py +179 -109
  34. hud/server/__init__.py +2 -2
  35. hud/server/requests.py +148 -186
  36. hud/server/tests/__init__.py +0 -0
  37. hud/server/tests/test_requests.py +275 -0
  38. hud/settings.py +3 -2
  39. hud/task.py +9 -19
  40. hud/taskset.py +44 -11
  41. hud/trajectory.py +6 -9
  42. hud/types.py +12 -9
  43. hud/utils/__init__.py +2 -2
  44. hud/utils/common.py +36 -15
  45. hud/utils/config.py +45 -30
  46. hud/utils/progress.py +34 -21
  47. hud/utils/telemetry.py +10 -11
  48. hud/utils/tests/__init__.py +0 -0
  49. hud/utils/tests/test_common.py +52 -0
  50. hud/utils/tests/test_config.py +129 -0
  51. hud/utils/tests/test_progress.py +225 -0
  52. hud/utils/tests/test_telemetry.py +37 -0
  53. hud/utils/tests/test_version.py +8 -0
  54. {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/METADATA +9 -6
  55. hud_python-0.2.4.dist-info/RECORD +62 -0
  56. hud_python-0.2.2.dist-info/RECORD +0 -46
  57. {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/WHEEL +0 -0
  58. {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/licenses/LICENSE +0 -0
hud/env/remote_client.py CHANGED
@@ -5,23 +5,25 @@ from base64 import b64decode
 from typing import TYPE_CHECKING, Any
 
 from hud.env.client import Client
+from hud.exceptions import HudResponseError
 from hud.server import make_request
 from hud.settings import settings
 from hud.types import EnvironmentStatus
 from hud.utils import ExecuteResult
 
 if TYPE_CHECKING:
-    from hud.utils.config import HudStyleConfig
+    from hud.utils.config import FunctionConfig
 
 logger = logging.getLogger("hud.env.remote_env_client")
 
+
 class RemoteClient(Client):
     """
     Remote environment client implementation.
-
+
     Uses the HUD API to manage a remote environment.
     """
-
+
     @classmethod
     async def create(
         cls,
@@ -33,21 +35,23 @@ class RemoteClient(Client):
     ) -> tuple[RemoteClient, dict[str, Any]]:
         """
         Creates a remote environment client from a dockerfile or gym_id.
-
+
         Args:
             dockerfile: The dockerfile content to build the environment
             gym_id: The gym_id of the environment to create
            metadata: Metadata to associate with the environment
-
+
         Returns:
-            RemoteClient: An instance of the remote environment client
+            A tuple containing the remote environment client and the build metadata
+
+        Raises:
+            HudResponseError: If the environment creation is successful but the response is invalid.
         """
 
         # Validate arguments
         if metadata is None:
             metadata = {}
 
-
         request_data = {
             # still named run_id for backwards compatibility
             "run_id": job_id,
@@ -63,33 +67,38 @@ class RemoteClient(Client):
             json=request_data,
             api_key=settings.api_key,
         )
-
+
         # Get the environment ID from the response
         env_id = response.get("id")
         if not env_id:
-            raise ValueError("Failed to create remote environment: No ID returned")
-
+            raise HudResponseError(
+                message="Failed to create remote environment: No ID returned in API response. "
+                "Please contact support if this issue persists.",
+                response_json=response,
+            )
+
         # Create the controller instance
         controller = cls(env_id)
-
+
         build_data = response.get("metadata", {})
-
+
         if response.get("readme"):
-            logger.info("[HUD] %s gym created, see how to use it at %s", gym_id,
-                        response.get("readme"))
-
+            logger.info(
+                "[HUD] %s gym created, see how to use it at %s", gym_id, response.get("readme")
+            )
+
         return controller, build_data
 
     def __init__(self, env_id: str) -> None:
         """
         Initialize the RemoteClient.
-
+
         Args:
             env_id: ID of the remote environment to control
         """
         super().__init__()
         self._env_id = env_id
-
+
     @property
     def env_id(self) -> str:
         """The ID of the remote environment."""
@@ -98,7 +107,7 @@ class RemoteClient(Client):
     async def get_status(self) -> EnvironmentStatus:
         """
         Get the current status of the remote environment.
-
+
         Returns:
             EnvironmentStatus: The current status of the environment
         """
@@ -111,7 +120,7 @@ class RemoteClient(Client):
         logger.debug("Environment status response: %s", response)
 
         status = response.get("state", "").lower()
-
+
         if status == "running":
             return EnvironmentStatus.RUNNING
         elif status == "initializing" or status == "pending":
@@ -122,12 +131,12 @@ class RemoteClient(Client):
             # Any other status is considered an error
             logger.warning("Abnormal environment status response: %s", response)
             return EnvironmentStatus.ERROR
-
+
         except Exception:
             # If we can't connect to the API or there's any other error
             logger.info("(potentially transient) Error getting environment status")
             return EnvironmentStatus.ERROR
-
+
     async def execute(
         self,
         command: list[str],
@@ -138,11 +147,11 @@ class RemoteClient(Client):
         """
         Execute a command in the environment.
         No-op in some environments (like browser use).
-
+
         Args:
             command: Command to execute
             workdir: Working directory for the command (ignored for remote environments)
-
+
         Returns:
             ExecuteResult: Result of the command execution
         """
@@ -150,21 +159,20 @@ class RemoteClient(Client):
             method="POST",
             url=f"{settings.base_url}/v2/environments/{self.env_id}/execute",
             json={
-                "command": command,
-                "workdir": workdir,
-                "timeout": timeout,
+                "command": command,
+                "workdir": workdir,
+                "timeout": timeout,
             },
             api_key=settings.api_key,
         )
-
+
         return ExecuteResult(
             stdout=b64decode(data["stdout"]),
             stderr=b64decode(data["stderr"]),
-            exit_code=data["exit_code"]
+            exit_code=data["exit_code"],
        )
 
-
-    async def invoke(self, config: HudStyleConfig) -> tuple[Any, bytes, bytes]:
+    async def invoke(self, config: FunctionConfig) -> tuple[Any, bytes, bytes]:
         """
         Invoke a function in the environment.
         """
@@ -174,9 +182,8 @@ class RemoteClient(Client):
             json=config.model_dump(),
             api_key=settings.api_key,
         )
-
-        return data["result"], b64decode(data["stdout"]), b64decode(data["stderr"])
 
+        return data["result"], b64decode(data["stdout"]), b64decode(data["stderr"])
 
     async def close(self) -> None:
         """
hud/env/remote_docker_client.py CHANGED
@@ -5,6 +5,7 @@ from base64 import b64decode, b64encode
 from typing import Any
 
 from hud.env.docker_client import DockerClient
+from hud.exceptions import HudResponseError
 from hud.server import make_request
 from hud.settings import settings
 from hud.types import EnvironmentStatus
@@ -39,7 +40,10 @@ class RemoteDockerClient(DockerClient):
            metadata: Metadata to associate with the environment
 
         Returns:
-            RemoteClient: An instance of the remote environment client
+            A tuple containing the remote environment client and the build metadata
+
+        Raises:
+            HudResponseError: If the environment creation fails.
         """
 
         # Validate arguments
@@ -48,7 +52,7 @@ class RemoteDockerClient(DockerClient):
 
         logger.info("Creating remote environment")
 
-        true_gym_id = await get_gym_id("local-docker")
+        true_gym_id = await get_gym_id("docker")
 
         # augment metadata with dockerfile
         if "environment_config" not in metadata:
@@ -73,7 +77,13 @@ class RemoteDockerClient(DockerClient):
         # Get the environment ID from the response
         env_id = response.get("id")
         if not env_id:
-            raise ValueError("Failed to create remote environment: No ID returned")
+            raise HudResponseError(
+                message=(
+                    "Failed to create remote environment: No ID returned in API response. "
+                    "Please contact support if this issue persists."
+                ),
+                response_json=response,
+            )
 
         # Create the controller instance
         controller = cls(env_id)
hud/evaluators/__init__.py CHANGED
@@ -1,10 +1,9 @@
 """
 Evaluators for assessing task responses.
 """
+
 from __future__ import annotations
 
 from hud.evaluators.base import Evaluator
 
-__all__ = [
-    "Evaluator"
-]
+__all__ = ["Evaluator"]
hud/evaluators/base.py CHANGED
@@ -11,21 +11,22 @@ if TYPE_CHECKING:
 
 class EvaluationResult(BaseModel):
     """Result of an evaluation.
-
+
     Attributes:
         score: Float score between 0 and 1
         reason: Explanation of the evaluation
        mode: Mode used for matching, if applicable
     """
-
+
     score: float
     reason: str
     mode: str | None = None
     criteria_scores: dict[str, float] | None = Field(default_factory=dict)
 
+
 class Evaluator(ABC):
     """Abstract base class for evaluators."""
-
+
     @abstractmethod
     def evaluate(self, task: Task, response: str) -> EvaluationResult:
         """Evaluate a task and response."""
hud/evaluators/inspect.py CHANGED
@@ -10,20 +10,15 @@ def inspect_evaluate(
     answer: Any,
 ) -> EvaluationResult:
     """Evaluate using Inspect-ai's evaluation models.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
        model_name: The Inspect model to use
        prompt: Optional custom prompt for evaluation
        metrics: Optional list of metrics to evaluate against
-
+
     Returns:
         EvaluationResult with the evaluation results
     """
-    return EvaluationResult(
-        score=0.0,
-        reason="Inspect evaluation not implemented",
-        mode="inspect"
-    )
-
+    return EvaluationResult(score=0.0, reason="Inspect evaluation not implemented", mode="inspect")
hud/evaluators/judge.py CHANGED
@@ -11,33 +11,26 @@ from hud.settings import settings
 
 class LLM(Protocol):
     """Protocol for LLM interfaces that can be used for evaluation."""
-    async def ainvoke(self, prompt: str) -> str: ...
+
+    async def ainvoke(self, prompt: str, /) -> str: ...
 
 
 class Criterion(TypedDict, total=False):
     """Criterion for judge-based evaluation."""
-
+
     description: str
     weight: float
 
 
 async def _call_eval_endpoint(
-    response: Any,
-    answer: Any,
-    criteria: list[Any],
-    mode: str
+    response: Any, answer: Any, criteria: list[Any], mode: str
 ) -> dict[str, Any]:
     """Call the run_eval endpoint to evaluate the response."""
     try:
         result = await make_request(
             method="POST",
             url=f"{settings.base_url}/evaluations/run_eval",
-            json={
-                "response": response,
-                "answer": answer,
-                "criteria": criteria,
-                "mode": mode
-            },
+            json={"response": response, "answer": answer, "criteria": criteria, "mode": mode},
             api_key=settings.api_key,
         )
         return result
@@ -46,31 +39,24 @@ async def _call_eval_endpoint(
         return {
             "score": -1.0,
             "reason": f"Remote evaluation failed: {e!s}. Fallback to default score.",
-            "criteria_scores": {}
+            "criteria_scores": {},
         }
 
 
-def _determine_mode(answer: Any) -> str:
-    """Determine the evaluation mode based on answer type."""
-    if isinstance(answer, bytes) or _is_base64_image(answer):
-        return "VLM"
-    return "LLM"
-
-
 def _process_input(data: Any) -> Any:
     """Process input data, detecting and handling base64 images."""
     if isinstance(data, bytes):
         # Convert bytes to base64 string
         return base64.b64encode(data).decode("utf-8")
-
+
     if isinstance(data, str) and _is_base64_image(data):
         # It's already a base64 string, just return it
         return data
-
+
     if isinstance(data, list) and all(isinstance(item, str) for item in data):
         # Process list of strings
         return data
-
+
     # For other types, convert to string
     return str(data) if not isinstance(data, str | dict) else data
 
@@ -79,11 +65,11 @@ def _is_base64_image(data: Any) -> bool:
     """Check if a string is a base64 encoded image."""
     if not isinstance(data, str):
         return False
-
+
     # Check for common image data URI pattern
     if data.startswith(("data:image/", "data:application/octet-stream")):
         return True
-
+
     # Check if it's a base64 encoded string with image header
     try:
         # First, validate it's base64 decodable
@@ -95,9 +81,7 @@ def _is_base64_image(data: Any) -> bool:
         sample = base64.b64decode(data[:30])
 
         # Check for common image format signatures
-        return (
-            sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
-        )
+        return sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
     except Exception:
         return False
 
@@ -109,50 +93,46 @@ def judge(
     criteria: list[str] | list[dict] | None = None,
 ) -> EvaluationResult:
     """Judge a response against an answer using an LLM.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
        llm: Optional langchain LLM to use for evaluation
        criteria: Evaluation criteria as strings or dictionaries
-
+
     Returns:
         EvaluationResult with evaluation results
     """
     # Process inputs
     processed_response = _process_input(response)
     processed_answer = _process_input(answer)
-
+
     # If LLM is provided, use it for evaluation
     if llm:
         return _evaluate_with_llm(processed_response, processed_answer, llm, criteria)
-
+
     # Otherwise, use the remote evaluation service
     mode = "LLM"
     if isinstance(answer, bytes) or _is_base64_image(answer):
         mode = "VLM"
-
+
     # Call the eval endpoint synchronously
-    result = asyncio.run(_call_eval_endpoint(
-        response=processed_response,
-        answer=processed_answer,
-        criteria=criteria or [],
-        mode=mode
-    ))
-
+    result = asyncio.run(
+        _call_eval_endpoint(
+            response=processed_response, answer=processed_answer, criteria=criteria or [], mode=mode
+        )
+    )
+
     return EvaluationResult(
         score=result.get("score", -1.0),
         reason=result.get("reason", "Response evaluated"),
         mode=mode,
-        criteria_scores=result.get("criteria_scores", {})
+        criteria_scores=result.get("criteria_scores", {}),
     )
 
 
 def _evaluate_with_llm(
-    response: Any,
-    answer: Any,
-    llm: LLM,
-    criteria: list[str] | list[dict] | None = None
+    response: Any, answer: Any, llm: LLM, criteria: list[str] | list[dict] | None = None
 ) -> EvaluationResult:
     """Evaluate a response against an answer using a provided LLM."""
     criteria_text = ""
@@ -163,7 +143,7 @@ def _evaluate_with_llm(
             criteria_text += f"- {c['description']}\n"
         elif isinstance(c, str):
             criteria_text += f"- {c}\n"
-
+
     prompt = f"""Evaluate the quality of a response given a reference answer.
 
 REFERENCE ANSWER:
@@ -181,33 +161,29 @@ Format your answer as a JSON object with 'score' (float) and 'reason' (string) f
     try:
         # Run the evaluation asynchronously
         result_text = asyncio.run(llm.ainvoke(prompt))
-
+
         # Attempt to parse JSON response
         import json
         import re
-
+
         # Try to extract JSON if wrapped in other text
         json_match = re.search(r"\{.*?\}", result_text, re.DOTALL)
         if json_match:
             json_str = json_match.group(0)
             result = json.loads(json_str)
-
+
             return EvaluationResult(
                 score=float(result.get("score", 0.5)),
                 reason=result.get("reason", "Evaluated with custom LLM"),
-                mode="custom_llm"
+                mode="custom_llm",
             )
-
+
         # If can't parse as JSON, use default values
         return EvaluationResult(
             score=0.5,
             reason=f"Unable to parse LLM response as JSON. Raw response: {result_text[:100]}...",
-            mode="custom_llm"
+            mode="custom_llm",
         )
-
+
     except Exception as e:
-        return EvaluationResult(
-            score=0.0,
-            reason=f"LLM evaluation error: {e!s}",
-            mode="custom_llm"
-        )
+        return EvaluationResult(score=0.0, reason=f"LLM evaluation error: {e!s}", mode="custom_llm")
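
One practical effect of the LLM protocol change is that the prompt argument is now positional-only, so any object exposing an async ainvoke(prompt) method can be passed to judge. A hypothetical sketch follows (the stubbed model is illustrative only, and the keyword names mirror the docstring in the diff):

    from hud.evaluators.judge import judge

    class StubLLM:
        async def ainvoke(self, prompt: str, /) -> str:
            # A real implementation would call a model; this stub returns a fixed JSON verdict.
            return '{"score": 0.9, "reason": "close paraphrase of the reference answer"}'

    result = judge(
        response="Paris is the capital of France.",
        answer="The capital of France is Paris.",
        llm=StubLLM(),
        criteria=["factual accuracy"],
    )
    # judge() only falls back to the remote evaluation service when llm is None
    print(result.score, result.reason, result.mode)
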