hud-python 0.1.5__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of hud-python might be problematic.
Files changed (46)
  1. hud/__init__.py +16 -12
  2. hud/adapters/__init__.py +4 -2
  3. hud/adapters/claude/adapter.py +0 -1
  4. hud/adapters/common/adapter.py +11 -10
  5. hud/adapters/common/types.py +27 -13
  6. hud/adapters/operator/__init__.py +5 -0
  7. hud/adapters/operator/adapter.py +93 -0
  8. hud/agent/__init__.py +7 -0
  9. hud/agent/base.py +109 -0
  10. hud/agent/claude.py +187 -0
  11. hud/agent/operator.py +190 -0
  12. hud/env/__init__.py +11 -0
  13. hud/env/client.py +35 -0
  14. hud/env/docker_client.py +306 -0
  15. hud/env/environment.py +181 -0
  16. hud/env/local_docker_client.py +249 -0
  17. hud/env/remote_client.py +185 -0
  18. hud/env/remote_docker_client.py +221 -0
  19. hud/evaluators/__init__.py +10 -0
  20. hud/evaluators/base.py +31 -0
  21. hud/evaluators/inspect.py +29 -0
  22. hud/evaluators/judge.py +213 -0
  23. hud/evaluators/match.py +163 -0
  24. hud/evaluators/remote.py +78 -0
  25. hud/gym.py +101 -15
  26. hud/job.py +185 -0
  27. hud/server/__init__.py +2 -2
  28. hud/server/requests.py +87 -0
  29. hud/settings.py +13 -2
  30. hud/task.py +133 -0
  31. hud/taskset.py +95 -0
  32. hud/trajectory.py +90 -0
  33. hud/types.py +65 -0
  34. hud/utils/__init__.py +4 -2
  35. hud/utils/common.py +69 -0
  36. hud/utils/config.py +182 -4
  37. hud/utils/telemetry.py +67 -0
  38. hud_python-0.2.0.dist-info/METADATA +188 -0
  39. hud_python-0.2.0.dist-info/RECORD +44 -0
  40. {hud_python-0.1.5.dist-info → hud_python-0.2.0.dist-info}/licenses/LICENSE +1 -1
  41. hud/client.py +0 -200
  42. hud/environment.py +0 -318
  43. hud/run.py +0 -208
  44. hud_python-0.1.5.dist-info/METADATA +0 -125
  45. hud_python-0.1.5.dist-info/RECORD +0 -21
  46. {hud_python-0.1.5.dist-info → hud_python-0.2.0.dist-info}/WHEEL +0 -0
hud/env/remote_docker_client.py ADDED
@@ -0,0 +1,221 @@
+ from __future__ import annotations
+
+ import logging
+ from base64 import b64decode, b64encode
+ from typing import Any
+
+ from hud.env.docker_client import DockerClient
+ from hud.server import make_request
+ from hud.settings import settings
+ from hud.types import EnvironmentStatus
+ from hud.utils import ExecuteResult
+ from hud.utils.common import get_gym_id
+
+ logger = logging.getLogger("hud.env.remote_env_client")
+
+
+ class RemoteDockerClient(DockerClient):
+     """
+     Remote environment client implementation.
+
+     Uses the HUD API to manage a remote environment.
+     """
+
+     @classmethod
+     async def create(
+         cls,
+         dockerfile: str,
+         *,
+         job_id: str | None = None,
+         task_id: str | None = None,
+         metadata: dict[str, Any] | None = None,
+     ) -> tuple[RemoteDockerClient, dict[str, Any]]:
+         """
+         Creates a remote environment client from a dockerfile or gym_id.
+
+         Args:
+             dockerfile: The dockerfile content to build the environment
+             gym_id: The gym_id of the environment to create
+             metadata: Metadata to associate with the environment
+
+         Returns:
+             RemoteClient: An instance of the remote environment client
+         """
+
+         # Validate arguments
+         if metadata is None:
+             metadata = {}
+
+         logger.info("Creating remote environment")
+
+         true_gym_id = await get_gym_id("local-docker")
+
+         # augment metadata with dockerfile
+         if "environment_config" not in metadata:
+             metadata["environment_config"] = {}
+
+         metadata["environment_config"]["dockerfile"] = dockerfile
+
+         # Create a new environment via the HUD API
+         response = await make_request(
+             method="POST",
+             url=f"{settings.base_url}/v2/create_environment",
+             json={
+                 # still named run_id for backwards compatibility
+                 "run_id": job_id,
+                 "metadata": metadata,
+                 "gym_id": true_gym_id,
+                 "task_id": task_id,
+             },
+             api_key=settings.api_key,
+         )
+
+         # Get the environment ID from the response
+         env_id = response.get("id")
+         if not env_id:
+             raise ValueError("Failed to create remote environment: No ID returned")
+
+         # Create the controller instance
+         controller = cls(env_id)
+
+         build_metadata = response.get("metadata", {})
+
+         return controller, build_metadata
+
+     def __init__(self, env_id: str) -> None:
+         """
+         Initialize the RemoteClient.
+
+         Args:
+             env_id: ID of the remote environment to control
+         """
+         super().__init__()
+         self._env_id = env_id
+
+     @property
+     def env_id(self) -> str:
+         """The ID of the remote environment."""
+         return self._env_id
+
+     async def get_status(self) -> EnvironmentStatus:
+         """
+         Get the current status of the remote environment.
+
+         Returns:
+             EnvironmentStatus: The current status of the environment
+         """
+         try:
+             response = await make_request(
+                 method="GET",
+                 url=f"{settings.base_url}/v2/environments/{self.env_id}/state",
+                 api_key=settings.api_key,
+             )
+             logger.debug("Environment status response: %s", response)
+
+             status = response.get("state", "").lower()
+
+             if status == "running":
+                 return EnvironmentStatus.RUNNING
+             elif status == "initializing" or status == "pending":
+                 return EnvironmentStatus.INITIALIZING
+             elif status == "completed" or status == "terminated":
+                 return EnvironmentStatus.COMPLETED
+             else:
+                 # Any other status is considered an error
+                 logger.warning("Abnormal environment status response: %s", response)
+                 return EnvironmentStatus.ERROR
+
+         except Exception:
+             # If we can't connect to the API or there's any other error
+             logger.info("(potentially transient) Error getting environment status")
+             return EnvironmentStatus.ERROR
+
+     async def execute(
+         self,
+         command: list[str],
+         *,
+         workdir: str | None = None,
+         timeout: float | None = None,
+     ) -> ExecuteResult:
+         """
+         Execute a command in the environment.
+         No-op in some environments (like browser use).
+
+         Args:
+             command: Command to execute
+             workdir: Working directory for the command (ignored for remote environments)
+
+         Returns:
+             ExecuteResult: Result of the command execution
+         """
+         data = await make_request(
+             method="POST",
+             url=f"{settings.base_url}/v2/environments/{self.env_id}/execute",
+             json={
+                 "command": command,
+                 "workdir": workdir,
+                 "timeout": timeout,
+             },
+             api_key=settings.api_key,
+         )
+
+         return ExecuteResult(
+             stdout=b64decode(data["stdout"]),
+             stderr=b64decode(data["stderr"]),
+             exit_code=data["exit_code"],
+         )
+
+     async def get_archive(self, path: str) -> bytes:
+         """
+         Get an archive of a path from the environment.
+         May not be supported for all environments.
+
+         Args:
+             path: Path in the environment to archive
+
+         Returns:
+             bytes: Content of the file or archive
+         """
+         data = await make_request(
+             method="POST",
+             url=f"{settings.base_url}/v2/environments/{self.env_id}/get_archive",
+             json={"path": path},
+             api_key=settings.api_key,
+         )
+
+         # Return the content decoded from base64
+         return b64decode(data["content"])
+
+     async def put_archive(self, path: str, data: bytes) -> bool:
+         """
+         Put an archive of data at a path in the environment.
+         May not be supported for all environments.
+
+         Args:
+             path: Path in the environment to extract the archive to
+             data: Bytes of the data to send
+
+         Returns:
+             bool: True if successful
+         """
+         await make_request(
+             method="POST",
+             url=f"{settings.base_url}/v2/environments/{self.env_id}/put_archive",
+             json={
+                 "path": path,
+                 "content": b64encode(data).decode("utf-8"),
+             },
+             api_key=settings.api_key,
+         )
+
+         return True
+
+     async def close(self) -> None:
+         """
+         Close the remote environment by making a request to the server.
+         """
+         await make_request(
+             method="POST",
+             url=f"{settings.base_url}/v2/environments/{self.env_id}/close",
+             api_key=settings.api_key,
+         )
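
The hunk above adds the remote Docker-backed client. A minimal usage sketch, assuming HUD API credentials are already configured via hud.settings and using a made-up Dockerfile string (the diff itself shows no concrete example):

```python
import asyncio

from hud.env.remote_docker_client import RemoteDockerClient


async def main() -> None:
    # Hypothetical Dockerfile content; the diff only shows that a string is expected.
    dockerfile = "FROM python:3.11-slim\n"

    # create() returns the client together with build metadata reported by the HUD API.
    client, build_metadata = await RemoteDockerClient.create(dockerfile=dockerfile)

    try:
        print("status:", await client.get_status())

        # Run a command remotely; stdout/stderr come back base64-decoded by the client.
        result = await client.execute(["echo", "hello"], timeout=30.0)
        print("execute result:", result)
    finally:
        # Tear down the remote environment via the /close endpoint.
        await client.close()


if __name__ == "__main__":
    asyncio.run(main())
```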
hud/evaluators/__init__.py ADDED
@@ -0,0 +1,10 @@
+ """
+ Evaluators for assessing task responses.
+ """
+ from __future__ import annotations
+
+ from hud.evaluators.base import Evaluator
+
+ __all__ = [
+     "Evaluator"
+ ]
hud/evaluators/base.py ADDED
@@ -0,0 +1,31 @@
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+ from typing import TYPE_CHECKING
+
+ from pydantic import BaseModel, Field
+
+ if TYPE_CHECKING:
+     from hud.task import Task
+
+
+ class EvaluationResult(BaseModel):
+     """Result of an evaluation.
+
+     Attributes:
+         score: Float score between 0 and 1
+         reason: Explanation of the evaluation
+         mode: Mode used for matching, if applicable
+     """
+
+     score: float
+     reason: str
+     mode: str | None = None
+     criteria_scores: dict[str, float] | None = Field(default_factory=dict)
+
+ class Evaluator(ABC):
+     """Abstract base class for evaluators."""
+
+     @abstractmethod
+     def evaluate(self, task: Task, response: str) -> EvaluationResult:
+         """Evaluate a task and response."""
hud/evaluators/inspect.py ADDED
@@ -0,0 +1,29 @@
+ from __future__ import annotations
+
+ from typing import Any
+
+ from hud.evaluators.base import EvaluationResult
+
+
+ def inspect_evaluate(
+     response: Any,
+     answer: Any,
+ ) -> EvaluationResult:
+     """Evaluate using Inspect-ai's evaluation models.
+
+     Args:
+         response: The response to evaluate
+         answer: The reference answer to compare against
+         model_name: The Inspect model to use
+         prompt: Optional custom prompt for evaluation
+         metrics: Optional list of metrics to evaluate against
+
+     Returns:
+         EvaluationResult with the evaluation results
+     """
+     return EvaluationResult(
+         score=0.0,
+         reason="Inspect evaluation not implemented",
+         mode="inspect"
+     )
+
hud/evaluators/judge.py ADDED
@@ -0,0 +1,213 @@
+ from __future__ import annotations
+
+ import asyncio
+ import base64
+ from typing import Any, Protocol, TypedDict
+
+ from hud.evaluators.base import EvaluationResult
+ from hud.server import make_request
+ from hud.settings import settings
+
+
+ class LLM(Protocol):
+     """Protocol for LLM interfaces that can be used for evaluation."""
+     async def ainvoke(self, prompt: str) -> str: ...
+
+
+ class Criterion(TypedDict, total=False):
+     """Criterion for judge-based evaluation."""
+
+     description: str
+     weight: float
+
+
+ async def _call_eval_endpoint(
+     response: Any,
+     answer: Any,
+     criteria: list[Any],
+     mode: str
+ ) -> dict[str, Any]:
+     """Call the run_eval endpoint to evaluate the response."""
+     try:
+         result = await make_request(
+             method="POST",
+             url=f"{settings.base_url}/evaluations/run_eval",
+             json={
+                 "response": response,
+                 "answer": answer,
+                 "criteria": criteria,
+                 "mode": mode
+             },
+             api_key=settings.api_key,
+         )
+         return result
+     except Exception as e:
+         # Fallback to local evaluation if remote call fails
+         return {
+             "score": -1.0,
+             "reason": f"Remote evaluation failed: {e!s}. Fallback to default score.",
+             "criteria_scores": {}
+         }
+
+
+ def _determine_mode(answer: Any) -> str:
+     """Determine the evaluation mode based on answer type."""
+     if isinstance(answer, bytes) or _is_base64_image(answer):
+         return "VLM"
+     return "LLM"
+
+
+ def _process_input(data: Any) -> Any:
+     """Process input data, detecting and handling base64 images."""
+     if isinstance(data, bytes):
+         # Convert bytes to base64 string
+         return base64.b64encode(data).decode("utf-8")
+
+     if isinstance(data, str) and _is_base64_image(data):
+         # It's already a base64 string, just return it
+         return data
+
+     if isinstance(data, list) and all(isinstance(item, str) for item in data):
+         # Process list of strings
+         return data
+
+     # For other types, convert to string
+     return str(data) if not isinstance(data, str | dict) else data
+
+
+ def _is_base64_image(data: Any) -> bool:
+     """Check if a string is a base64 encoded image."""
+     if not isinstance(data, str):
+         return False
+
+     # Check for common image data URI pattern
+     if data.startswith(("data:image/", "data:application/octet-stream")):
+         return True
+
+     # Check if it's a base64 encoded string with image header
+     try:
+         # First, validate it's base64 decodable
+         padding_needed = len(data) % 4
+         if padding_needed:
+             data += "=" * (4 - padding_needed)
+
+         # Try to decode the first few bytes to check for image signatures
+         sample = base64.b64decode(data[:30])
+
+         # Check for common image format signatures
+         return (
+             sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
+         )
+     except Exception:
+         return False
+
+
+ def judge(
+     response: Any,
+     answer: Any,
+     llm: LLM | None = None,
+     criteria: list[str] | list[dict] | None = None,
+ ) -> EvaluationResult:
+     """Judge a response against an answer using an LLM.
+
+     Args:
+         response: The response to evaluate
+         answer: The reference answer to compare against
+         llm: Optional langchain LLM to use for evaluation
+         criteria: Evaluation criteria as strings or dictionaries
+
+     Returns:
+         EvaluationResult with evaluation results
+     """
+     # Process inputs
+     processed_response = _process_input(response)
+     processed_answer = _process_input(answer)
+
+     # If LLM is provided, use it for evaluation
+     if llm:
+         return _evaluate_with_llm(processed_response, processed_answer, llm, criteria)
+
+     # Otherwise, use the remote evaluation service
+     mode = "LLM"
+     if isinstance(answer, bytes) or _is_base64_image(answer):
+         mode = "VLM"
+
+     # Call the eval endpoint synchronously
+     result = asyncio.run(_call_eval_endpoint(
+         response=processed_response,
+         answer=processed_answer,
+         criteria=criteria or [],
+         mode=mode
+     ))
+
+     return EvaluationResult(
+         score=result.get("score", -1.0),
+         reason=result.get("reason", "Response evaluated"),
+         mode=mode,
+         criteria_scores=result.get("criteria_scores", {})
+     )
+
+
+ def _evaluate_with_llm(
+     response: Any,
+     answer: Any,
+     llm: LLM,
+     criteria: list[str] | list[dict] | None = None
+ ) -> EvaluationResult:
+     """Evaluate a response against an answer using a provided LLM."""
+     criteria_text = ""
+     if criteria:
+         criteria_text = "Use the following criteria:\n"
+         for c in criteria:
+             if isinstance(c, dict) and "description" in c:
+                 criteria_text += f"- {c['description']}\n"
+             elif isinstance(c, str):
+                 criteria_text += f"- {c}\n"
+
+     prompt = f"""Evaluate the quality of a response given a reference answer.
+
+ REFERENCE ANSWER:
+ {answer}
+
+ RESPONSE TO EVALUATE:
+ {response}
+
+ {criteria_text}
+ Rate the response on a scale from 0.0 to 1.0, where 1.0 is perfect.
+ Provide a brief explanation for your rating.
+ Format your answer as a JSON object with 'score' (float) and 'reason' (string) fields.
+ """
+
+     try:
+         # Run the evaluation asynchronously
+         result_text = asyncio.run(llm.ainvoke(prompt))
+
+         # Attempt to parse JSON response
+         import json
+         import re
+
+         # Try to extract JSON if wrapped in other text
+         json_match = re.search(r"\{.*?\}", result_text, re.DOTALL)
+         if json_match:
+             json_str = json_match.group(0)
+             result = json.loads(json_str)
+
+             return EvaluationResult(
+                 score=float(result.get("score", 0.5)),
+                 reason=result.get("reason", "Evaluated with custom LLM"),
+                 mode="custom_llm"
+             )
+
+         # If can't parse as JSON, use default values
+         return EvaluationResult(
+             score=0.5,
+             reason=f"Unable to parse LLM response as JSON. Raw response: {result_text[:100]}...",
+             mode="custom_llm"
+         )
+
+     except Exception as e:
+         return EvaluationResult(
+             score=0.0,
+             reason=f"LLM evaluation error: {e!s}",
+             mode="custom_llm"
+         )
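
A sketch of the two code paths in judge(): with no llm argument it blocks on the remote /evaluations/run_eval endpoint (which needs an API key from hud.settings), while any object exposing an async ainvoke(prompt) method satisfies the LLM protocol and keeps the evaluation local. The EchoJudge below is a toy stand-in, not a real model client:

```python
import json

from hud.evaluators.judge import judge


class EchoJudge:
    """Toy stand-in satisfying the LLM protocol (async ainvoke returning a string)."""

    async def ainvoke(self, prompt: str) -> str:
        # A real client would call a model here; this returns a fixed JSON verdict.
        return json.dumps({"score": 0.9, "reason": "Consistent with the reference."})


result = judge(
    response="Paris is the capital of France.",
    answer="Paris",
    llm=EchoJudge(),
    criteria=["Mentions the correct city"],
)
print(result.score, result.mode)  # 0.9 custom_llm
```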
hud/evaluators/match.py ADDED
@@ -0,0 +1,163 @@
+ from __future__ import annotations
+
+ import re
+ from difflib import SequenceMatcher
+ from typing import Any
+
+ from textdistance import levenshtein
+
+ from hud.evaluators.base import EvaluationResult
+
+
+ def match_single(response: Any, answer: Any) -> EvaluationResult:
+     """Check if the answer is present within the response.
+
+     Args:
+         response: The response to evaluate
+         answer: The expected answer
+
+     Returns:
+         EvaluationResult with score=1.0 if match, 0.0 otherwise
+     """
+     passed = str(answer).lower().strip() in str(response).lower().strip()
+     return EvaluationResult(
+         score=1.0 if passed else 0.0,
+         reason="Exact match" if passed else "No exact match found",
+         mode="single"
+     )
+
+
+ def match_all(response: Any, answers: list) -> EvaluationResult:
+     """Count how many expected answers are in the response.
+
+     Args:
+         response: The response to evaluate
+         answers: List of expected answers
+
+     Returns:
+         EvaluationResult with score=proportion of matches (0.0-1.0)
+     """
+     response_str = str(response).lower()
+     matches = 0
+
+     for answer in answers:
+         if str(answer).lower() in response_str:
+             matches += 1
+
+     score = matches / len(answers) if answers else 0.0
+
+     if matches == len(answers):
+         reason = f"All {matches} expected items found"
+     else:
+         reason = f"Only {matches} of {len(answers)} expected items found"
+
+     return EvaluationResult(
+         score=score,
+         reason=reason,
+         mode="all"
+     )
+
+
+ def match_fuzzy(response: Any, answer: Any) -> EvaluationResult:
+     """Calculate similarity using Levenshtein distance.
+
+     Args:
+         response: The response to evaluate
+         answer: The expected answer
+
+     Returns:
+         EvaluationResult with score=similarity (0.0-1.0)
+     """
+     s1 = str(response).lower()
+     s2 = str(answer).lower()
+
+     if s1 == s2:
+         score = 1.0
+     elif len(s1) == 0 or len(s2) == 0:
+         score = 0.0
+     else:
+         # Use Levenshtein distance
+         distance = levenshtein.distance(s1, s2)
+         max_len = max(len(s1), len(s2))
+         score = 1.0 - (distance / max_len)
+
+     return EvaluationResult(
+         score=score,
+         reason=f"Fuzzy match with {score:.1%} similarity",
+         mode="fuzz"
+     )
+
+
+ def match_regex(response: Any, pattern: str) -> EvaluationResult:
+     """Check if response matches regex pattern.
+
+     Args:
+         response: The response to evaluate
+         pattern: Regular expression pattern to match
+
+     Returns:
+         EvaluationResult with score=1.0 if match, 0.0 otherwise
+     """
+     try:
+         regex = re.compile(pattern, re.DOTALL)
+         passed = bool(regex.search(str(response)))
+         return EvaluationResult(
+             score=1.0 if passed else 0.0,
+             reason="Regex pattern matched" if passed else "Regex pattern did not match",
+             mode="regex"
+         )
+     except re.error:
+         return EvaluationResult(
+             score=0.0,
+             reason="Invalid regex pattern",
+             mode="regex"
+         )
+
+
+ def match_diff(response: Any, answer: Any) -> EvaluationResult:
+     """Compare difference between response and answer.
+
+     Args:
+         response: The response to evaluate
+         answer: The expected answer
+
+     Returns:
+         EvaluationResult with score=similarity (0.0-1.0)
+     """
+     if isinstance(response, int | float) and isinstance(answer, int | float):
+         score = _match_numeric_diff(response, answer)
+         reason = f"Numeric difference: {abs(response - answer)}"
+     else:
+         score = _match_string_diff(response, answer)
+         reason = f"String difference with {score:.1%} similarity"
+
+     return EvaluationResult(
+         score=score,
+         reason=reason,
+         mode="diff"
+     )
+
+
+ def _match_string_diff(response: Any, answer: Any) -> float:
+     """Compare difference between response and answer strings."""
+     matcher = SequenceMatcher(None, str(response), str(answer))
+     return matcher.ratio()
+
+
+ def _match_numeric_diff(response: float, answer: float) -> float:
+     """Calculate normalized difference between numeric values.
+
+     Returns a value between 0 and 1, where 1 means identical and 0 means maximum difference.
+     """
+     if response == answer:
+         return 1.0
+
+     # Simple absolute difference normalized to a 0-1 scale
+     diff = abs(response - answer)
+     max_val = max(abs(response), abs(answer))
+
+     if max_val == 0:
+         return 1.0  # Both are zero
+
+     # Normalize and invert so 1.0 means identical
+     return max(0.0, 1.0 - min(1.0, diff / max_val))
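
A quick sketch of how the matcher helpers above might be called; the inputs are made-up and each function returns an EvaluationResult with a score in [0, 1]:

```python
from hud.evaluators.match import (
    match_all,
    match_diff,
    match_fuzzy,
    match_regex,
    match_single,
)

response = "The invoice total is 42 USD, due on 2024-05-01."

print(match_single(response, "42 USD").score)             # 1.0 (substring found)
print(match_all(response, ["42", "USD", "EUR"]).score)     # ~0.67 (2 of 3 found)
print(match_fuzzy("colour", "color").score)                # Levenshtein-based similarity
print(match_regex(response, r"\d{4}-\d{2}-\d{2}").score)   # 1.0 (date pattern present)
print(match_diff(40, 42).reason)                           # "Numeric difference: 2"
```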