hud-python 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic.
- hud/__init__.py +5 -3
- hud/adapters/__init__.py +2 -1
- hud/adapters/claude/adapter.py +13 -17
- hud/adapters/common/adapter.py +3 -3
- hud/adapters/common/tests/__init__.py +0 -0
- hud/adapters/common/tests/test_adapter.py +277 -0
- hud/adapters/common/types.py +3 -6
- hud/adapters/operator/adapter.py +22 -29
- hud/agent/__init__.py +9 -1
- hud/agent/base.py +28 -28
- hud/agent/claude.py +69 -60
- hud/agent/langchain.py +204 -0
- hud/agent/operator.py +75 -67
- hud/env/__init__.py +5 -5
- hud/env/client.py +2 -2
- hud/env/docker_client.py +37 -39
- hud/env/environment.py +91 -66
- hud/env/local_docker_client.py +5 -7
- hud/env/remote_client.py +40 -29
- hud/env/remote_docker_client.py +13 -3
- hud/evaluators/__init__.py +2 -3
- hud/evaluators/base.py +4 -3
- hud/evaluators/inspect.py +3 -8
- hud/evaluators/judge.py +34 -58
- hud/evaluators/match.py +42 -49
- hud/evaluators/remote.py +13 -26
- hud/evaluators/tests/__init__.py +0 -0
- hud/evaluators/tests/test_inspect.py +12 -0
- hud/evaluators/tests/test_judge.py +231 -0
- hud/evaluators/tests/test_match.py +115 -0
- hud/evaluators/tests/test_remote.py +98 -0
- hud/exceptions.py +167 -0
- hud/gym.py +12 -10
- hud/job.py +525 -47
- hud/server/__init__.py +2 -2
- hud/server/requests.py +148 -186
- hud/server/tests/__init__.py +0 -0
- hud/server/tests/test_requests.py +275 -0
- hud/settings.py +3 -2
- hud/task.py +12 -22
- hud/taskset.py +44 -11
- hud/trajectory.py +6 -9
- hud/types.py +14 -9
- hud/utils/__init__.py +2 -2
- hud/utils/common.py +37 -13
- hud/utils/config.py +44 -29
- hud/utils/progress.py +149 -0
- hud/utils/telemetry.py +10 -11
- hud/utils/tests/__init__.py +0 -0
- hud/utils/tests/test_common.py +52 -0
- hud/utils/tests/test_config.py +129 -0
- hud/utils/tests/test_progress.py +225 -0
- hud/utils/tests/test_telemetry.py +37 -0
- hud/utils/tests/test_version.py +8 -0
- {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/METADATA +44 -21
- hud_python-0.2.3.dist-info/RECORD +62 -0
- hud_python-0.2.1.dist-info/RECORD +0 -44
- {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/WHEEL +0 -0
- {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/licenses/LICENSE +0 -0

hud/evaluators/tests/test_judge.py ADDED

@@ -0,0 +1,231 @@
+from __future__ import annotations
+
+import base64
+
+import pytest
+
+from hud.evaluators.base import EvaluationResult
+from hud.evaluators.judge import (
+    _call_eval_endpoint,
+    _evaluate_with_llm,
+    _is_base64_image,
+    _process_input,
+    judge,
+)
+
+
+class _MockLLM:
+    """Mock LLM for testing."""
+
+    def __init__(self, response_text):
+        self.response_text = response_text
+
+    async def ainvoke(self, _prompt: str) -> str:
+        return self.response_text
+
+
+@pytest.mark.parametrize(
+    "input_data, expected_result",
+    [
+        ("Hello world", "Hello world"),
+        (123, "123"),
+        (["Hello", "world"], ["Hello", "world"]),
+        ({"key": "value"}, {"key": "value"}),
+        (b"Hello world", base64.b64encode(b"Hello world").decode("utf-8")),
+    ],
+)
+def test_process_input(input_data, expected_result):
+    """Test processing various input types."""
+    result = _process_input(input_data)
+    assert result == expected_result
+
+
+@pytest.mark.parametrize(
+    "input_data, expected_result",
+    [
+        ("not an image", False),
+        ("data:image/png;base64,abc123", True),
+        (b"not an image", False),
+        (123, False),
+    ],
+)
+def test_is_base64_image(input_data, expected_result):
+    """Test base64 image detection."""
+    assert _is_base64_image(input_data) == expected_result
+
+
+def test_is_base64_image_with_signatures(mocker):
+    """Test base64 image detection with common image signatures."""
+    # Mock base64.b64decode to return different image signatures
+    mock_b64decode = mocker.patch("base64.b64decode")
+
+    # Test JPEG signature
+    mock_b64decode.return_value = b"\xff\xd8\xff" + b"some data"
+    assert _is_base64_image("not_really_base64_but_mocked") is True
+
+    # Test PNG signature
+    mock_b64decode.return_value = b"\x89PNG\r\n\x1a\n" + b"some data"
+    assert _is_base64_image("not_really_base64_but_mocked") is True
+
+    # Test GIF signature
+    mock_b64decode.return_value = b"GIF8" + b"some data"
+    assert _is_base64_image("not_really_base64_but_mocked") is True
+
+    # Test RIFF signature (WebP)
+    mock_b64decode.return_value = b"RIFF" + b"some data"
+    assert _is_base64_image("not_really_base64_but_mocked") is True
+
+
+@pytest.mark.asyncio
+async def test_call_eval_endpoint_success(mocker):
+    """Test successful remote evaluation call."""
+    mock_response = {
+        "score": 0.8,
+        "reason": "Good response",
+        "criteria_scores": {"relevance": 0.9, "accuracy": 0.7},
+    }
+    mock_make_request = mocker.patch(
+        "hud.evaluators.judge.make_request", return_value=mock_response
+    )
+    result = await _call_eval_endpoint("test response", "test answer", [], "LLM")
+    assert result == mock_response
+    mock_make_request.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_call_eval_endpoint_failure(mocker):
+    """Test remote evaluation call failure."""
+    mocker.patch("hud.evaluators.judge.make_request", side_effect=Exception("API error"))
+    result = await _call_eval_endpoint("test response", "test answer", [], "LLM")
+    assert result["score"] == -1.0
+    assert "Remote evaluation failed" in result["reason"]
+    assert result["criteria_scores"] == {}
+
+
+def test_judge_without_llm(mocker):
+    """Test judge function without custom LLM."""
+    mock_result = {
+        "score": 0.9,
+        "reason": "Good answer",
+        "criteria_scores": {"relevance": 1.0},
+    }
+
+    async def mock_endpoint(*args, **kwargs):
+        return mock_result
+
+    mocker.patch("hud.evaluators.judge._call_eval_endpoint", mock_endpoint)
+    result = judge("test response", "test answer")
+
+    assert result.score == 0.9
+    assert result.reason == "Good answer"
+    assert result.mode == "LLM"
+    assert result.criteria_scores == {"relevance": 1.0}
+
+
+def test_judge_with_image_answer(mocker):
+    """Test judge function with an image as the answer."""
+    mock_result = {
+        "score": 0.85,
+        "reason": "Good image analysis",
+        "criteria_scores": {"visual_accuracy": 0.85},
+    }
+
+    async def mock_endpoint(*args, **kwargs):
+        return mock_result
+
+    mocker.patch("hud.evaluators.judge._call_eval_endpoint", mock_endpoint)
+
+    # Create a mock image
+    image_data = b"fake_image_data"
+    base64_image = base64.b64encode(image_data).decode("utf-8")
+    image_uri = f"data:image/jpeg;base64,{base64_image}"
+
+    result = judge("description of image", image_uri)
+
+    assert result.score == 0.85
+    assert result.reason == "Good image analysis"
+    assert result.mode == "VLM"  # Should use VLM mode for images
+    assert result.criteria_scores == {"visual_accuracy": 0.85}
+
+
+def test_judge_with_llm(mocker):
+    """Test judge function with custom LLM."""
+    mock_llm = _MockLLM('{"score": 0.75, "reason": "Pretty good"}')
+    mock_result = EvaluationResult(score=0.75, reason="Pretty good", mode="custom_llm")
+    mocker.patch("hud.evaluators.judge._evaluate_with_llm", return_value=mock_result)
+    result = judge("test response", "test answer", llm=mock_llm)
+    assert result.score == 0.75
+    assert result.reason == "Pretty good"
+    assert result.mode == "custom_llm"
+
+
+def test_evaluate_with_llm_valid_json():
+    """Test _evaluate_with_llm with valid JSON response."""
+    llm = _MockLLM('{"score": 0.85, "reason": "The response is accurate and well-structured."}')
+    result = _evaluate_with_llm("test response", "test answer", llm)
+
+    assert result.score == 0.85
+    assert result.reason == "The response is accurate and well-structured."
+    assert result.mode == "custom_llm"
+
+
+def test_evaluate_with_llm_json_in_text():
+    """Test _evaluate_with_llm with JSON embedded in text."""
+    llm_response = """
+    I've evaluated the response and here's my assessment:
+
+    {"score": 0.7, "reason": "Good but could be more detailed"}
+
+    I hope this helps!
+    """
+    llm = _MockLLM(llm_response)
+    result = _evaluate_with_llm("test response", "test answer", llm)
+
+    assert result.score == 0.7
+    assert result.reason == "Good but could be more detailed"
+    assert result.mode == "custom_llm"
+
+
+def test_evaluate_with_llm_invalid_json():
+    """Test _evaluate_with_llm with invalid JSON response."""
+    llm = _MockLLM("This is not a JSON response")
+    result = _evaluate_with_llm("test response", "test answer", llm)
+
+    assert result.score == 0.5  # Default score for unparseable responses
+    assert "Unable to parse LLM response as JSON" in result.reason
+    assert result.mode == "custom_llm"
+
+
+def test_evaluate_with_llm_exception(mocker):
+    """Test _evaluate_with_llm when an exception occurs."""
+    # Mock the LLM to raise an exception
+    failing_llm = _MockLLM("doesn't matter")
+    mocker.patch.object(failing_llm, "ainvoke", side_effect=Exception("LLM API error"))
+
+    result = _evaluate_with_llm("test response", "test answer", failing_llm)
+
+    assert result.score == 0.0  # Zero score for errors
+    assert "LLM evaluation error: LLM API error" in result.reason
+    assert result.mode == "custom_llm"
+
+
+def test_evaluate_with_llm_with_criteria():
+    """Test _evaluate_with_llm with evaluation criteria."""
+    llm = _MockLLM('{"score": 0.9, "reason": "Excellent match on all criteria"}')
+
+    # Test with string criteria
+    string_criteria = ["Accuracy", "Relevance", "Completeness"]
+    result = _evaluate_with_llm("test response", "test answer", llm, criteria=string_criteria)
+
+    assert result.score == 0.9
+    assert result.reason == "Excellent match on all criteria"
+
+    # Test with dict criteria
+    dict_criteria = [
+        {"description": "Factual accuracy", "weight": 0.6},
+        {"description": "Grammar and spelling", "weight": 0.4},
+    ]
+    result = _evaluate_with_llm("test response", "test answer", llm, criteria=dict_criteria)
+
+    assert result.score == 0.9
+    assert result.reason == "Excellent match on all criteria"

hud/evaluators/tests/test_match.py ADDED

@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+import pytest
+
+from hud.evaluators.match import match_all, match_diff, match_fuzzy, match_regex, match_single
+
+
+@pytest.mark.parametrize(
+    "response, answer, expected_score, expected_reason, expected_mode",
+    [
+        ("Hello, world!", "world", 1.0, "Exact match", "single"),
+        ("Hello, world!", "not world", 0.0, "No exact match found", "single"),
+    ],
+)
+def test_match_single(
+    response: str,
+    answer: str,
+    expected_score: float,
+    expected_reason: str,
+    expected_mode: str,
+):
+    result = match_single(response, answer)
+    assert result.score == expected_score
+    assert result.reason == expected_reason
+    assert result.mode == expected_mode
+
+
+@pytest.mark.parametrize(
+    "response, answers, expected_score, expected_reason, expected_mode",
+    [
+        ("Hello, world!", ["world", "hello"], 1.0, "All 2 expected items found", "all"),
+        ("Hello, world!", ["world", "not hello"], 0.5, "Only 1 of 2 expected items found", "all"),
+    ],
+)
+def test_match_all(
+    response: str,
+    answers: list[str],
+    expected_score: float,
+    expected_reason: str,
+    expected_mode: str,
+):
+    result = match_all(response, answers)
+    assert result.score == expected_score
+    assert result.reason == expected_reason
+    assert result.mode == expected_mode
+
+
+@pytest.mark.parametrize(
+    "response, answer, expected_score, expected_reason, expected_mode",
+    [
+        ("hello world", "hello world", 1.0, "Fuzzy match with 100.0% similarity", "fuzz"),
+        ("hello wrld", "hello world", 0.9, "Fuzzy match with 90.9% similarity", "fuzz"),
+        ("hello", "hello world", 0.45, "Fuzzy match with 45.5% similarity", "fuzz"),
+        ("", "hello world", 0.0, "Fuzzy match with 0.0% similarity", "fuzz"),
+    ],
+)
+def test_match_fuzzy(
+    response: str,
+    answer: str,
+    expected_score: float,
+    expected_reason: str,
+    expected_mode: str,
+):
+    result = match_fuzzy(response, answer)
+    assert result.score == pytest.approx(expected_score, abs=1e-2)
+    assert result.reason == expected_reason
+    assert result.mode == expected_mode
+
+
+@pytest.mark.parametrize(
+    "response, pattern, expected_score, expected_reason, expected_mode",
+    [
+        ("hello world", r"hello.*", 1.0, "Regex pattern matched", "regex"),
+        ("hello world", r"^hello.*$", 1.0, "Regex pattern matched", "regex"),
+        ("hello world", r"world$", 1.0, "Regex pattern matched", "regex"),
+        ("hello world", r"^goodbye.*$", 0.0, "Regex pattern did not match", "regex"),
+        ("hello world", r"[invalid[", 0.0, "Invalid regex pattern", "regex"),
+    ],
+)
+def test_match_regex(
+    response: str,
+    pattern: str,
+    expected_score: float,
+    expected_reason: str,
+    expected_mode: str,
+):
+    result = match_regex(response, pattern)
+    assert result.score == expected_score
+    assert result.reason == expected_reason
+    assert result.mode == expected_mode
+
+
+@pytest.mark.parametrize(
+    "response, answer, expected_score, expected_reason, expected_mode",
+    [
+        ("hello world", "hello world", 1.0, "String difference with 100.0% similarity", "diff"),
+        ("hello", "hello world", 0.625, "String difference with 62.5% similarity", "diff"),
+        ("", "hello world", 0.0, "String difference with 0.0% similarity", "diff"),
+        (100, 100, 1.0, "Numeric difference: 0", "diff"),
+        (90, 100, 0.9, "Numeric difference: 10", "diff"),
+        (0, 100, 0.0, "Numeric difference: 100", "diff"),
+        (-100, 100, 0.0, "Numeric difference: 200", "diff"),
+    ],
+)
+def test_match_diff(
+    response: str | int | float,
+    answer: str | int | float,
+    expected_score: float,
+    expected_reason: str,
+    expected_mode: str,
+):
+    result = match_diff(response, answer)
+    assert result.score == pytest.approx(expected_score, abs=1e-2)
+    assert result.reason == expected_reason
+    assert result.mode == expected_mode

hud/evaluators/tests/test_remote.py ADDED

@@ -0,0 +1,98 @@
+from __future__ import annotations
+
+import pytest
+
+from hud.evaluators.base import EvaluationResult
+from hud.evaluators.remote import _remote_eval_call, remote_evaluate
+
+
+@pytest.mark.asyncio
+async def test_remote_eval_call_success(mocker):
+    mock_response = {
+        "score": 0.85,
+        "reason": "Good match",
+        "details": {"relevance": 0.9, "correctness": 0.8},
+    }
+    mock_make_request = mocker.patch(
+        "hud.evaluators.remote.make_request", return_value=mock_response
+    )
+
+    result = await _remote_eval_call(
+        response="test response", answer="test answer", eval_type="match"
+    )
+
+    assert result == mock_response
+    mock_make_request.assert_called_once()
+    call_args = mock_make_request.call_args[1]
+    assert call_args["method"] == "POST"
+    assert "evaluations/evaluate" in call_args["url"]
+    assert call_args["json"]["response"] == "test response"
+    assert call_args["json"]["answer"] == "test answer"
+    assert call_args["json"]["type"] == "match"
+
+
+@pytest.mark.asyncio
+async def test_remote_eval_call_with_config(mocker):
+    mock_response = {"score": 0.75, "reason": "Good", "details": {}}
+    mock_make_request = mocker.patch(
+        "hud.evaluators.remote.make_request", return_value=mock_response
+    )
+
+    config = {"threshold": 0.8, "strict": True}
+    result = await _remote_eval_call(
+        response="test response", answer="test answer", eval_type="judge", config=config
+    )
+
+    assert result == mock_response
+    mock_make_request.assert_called_once()
+    call_args = mock_make_request.call_args[1]
+    assert call_args["json"]["config"] == config
+
+
+@pytest.mark.asyncio
+async def test_remote_eval_call_failure(mocker):
+    mocker.patch("hud.evaluators.remote.make_request", side_effect=Exception("API error"))
+
+    result = await _remote_eval_call(
+        response="test response", answer="test answer", eval_type="match"
+    )
+
+    assert result["score"] == -1.0
+    assert "Remote evaluation failed" in result["reason"]
+    assert "API error" in result["reason"]
+    assert result["details"] == {}
+
+
+def test_remote_evaluate(mocker):
+    mock_result = {"score": 0.9, "reason": "Excellent match", "details": {"similarity": 0.95}}
+
+    async def mock_remote_call(*args, **kwargs):
+        return mock_result
+
+    mocker.patch("hud.evaluators.remote._remote_eval_call", side_effect=mock_remote_call)
+
+    result = remote_evaluate(
+        response="test response", answer="test answer", eval_type="custom_eval"
+    )
+
+    assert isinstance(result, EvaluationResult)
+    assert result.score == 0.9
+    assert result.reason == "Excellent match"
+    assert result.mode == "custom_eval"
+    assert result.criteria_scores == {"similarity": 0.95}
+
+
+def test_remote_evaluate_missing_fields(mocker):
+    mock_result = {"score": 0.8}  # Missing reason and details
+
+    async def mock_remote_call(*args, **kwargs):
+        return mock_result
+
+    mocker.patch("hud.evaluators.remote._remote_eval_call", side_effect=mock_remote_call)
+
+    result = remote_evaluate(response="test response", answer="test answer")
+
+    assert result.score == 0.8
+    assert result.reason == "Remote evaluation completed"
+    assert result.mode == "default"
+    assert result.criteria_scores == {}

hud/exceptions.py ADDED

@@ -0,0 +1,167 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    import httpx
+    from typing_extensions import Self
+
+logger = logging.getLogger(__name__)
+
+
+class HudException(Exception):
+    """Base exception class for all HUD SDK errors.
+
+    This is the parent class for all exceptions raised by the HUD SDK.
+    Consumers should be able to catch this exception to handle any HUD-related error.
+    """
+
+
+class HudRequestError(Exception):
+    """Any request to the HUD API can raise this exception."""
+
+    def __init__(
+        self,
+        message: str,
+        status_code: int | None = None,
+        response_text: str | None = None,
+        response_json: dict[str, Any] | None = None,
+        response_headers: dict[str, str] | None = None,
+    ) -> None:
+        self.message = message
+        self.status_code = status_code
+        self.response_text = response_text
+        self.response_json = response_json
+        self.response_headers = response_headers
+        super().__init__(message)
+
+    def __str__(self) -> str:
+        parts = [self.message]
+
+        if self.status_code:
+            parts.append(f"Status: {self.status_code}")
+
+        if self.response_text:
+            parts.append(f"Response Text: {self.response_text}")
+
+        if self.response_json:
+            parts.append(f"Response JSON: {self.response_json}")
+
+        if self.response_headers:
+            parts.append(f"Headers: {self.response_headers}")
+
+        return " | ".join(parts)
+
+    @classmethod
+    def from_httpx_error(cls, error: httpx.HTTPStatusError, context: str = "") -> Self:
+        """Create a RequestError from an HTTPx error response.
+
+        Args:
+            error: The HTTPx error response.
+            context: Additional context to include in the error message.
+
+        Returns:
+            A RequestError instance.
+        """
+        response = error.response
+        status_code = response.status_code
+        response_text = response.text
+        response_headers = dict(response.headers)
+
+        # Try to get detailed error info from JSON if available
+        response_json = None
+        try:
+            response_json = response.json()
+            detail = response_json.get("detail")
+            if detail:
+                message = f"Request failed: {detail}"
+            else:
+                # If no detail field but we have JSON, include a summary
+                message = f"Request failed with status {status_code}"
+                if len(response_json) <= 5:  # If it's a small object, include it in the message
+                    message += f" - JSON response: {response_json}"
+        except Exception:
+            # Fallback to simple message if JSON parsing fails
+            message = f"Request failed with status {status_code}"
+
+        # Add context if provided
+        if context:
+            message = f"{context}: {message}"
+
+        # Log the error details
+        logger.error(
+            "HTTP error from HUD SDK: %s | URL: %s | Status: %s | Response: %s%s",
+            message,
+            response.url,
+            status_code,
+            response_text[:500],
+            "..." if len(response_text) > 500 else "",
+        )
+        return cls(
+            message=message,
+            status_code=status_code,
+            response_text=response_text,
+            response_json=response_json,
+            response_headers=response_headers,
+        )
+
+
+class HudResponseError(HudException):
+    """Raised when an API response is invalid or missing required data.
+
+    This exception is raised when we receive a successful response (e.g. 200)
+    but the response data is invalid, missing required fields, or otherwise
+    cannot be processed.
+
+    Attributes:
+        message: A human-readable error message
+        response_json: The invalid response data
+    """
+
+    def __init__(
+        self,
+        message: str,
+        response_json: dict[str, Any] | None = None,
+    ) -> None:
+        self.message = message
+        self.response_json = response_json
+        super().__init__(message)
+
+    def __str__(self) -> str:
+        parts = [self.message]
+        if self.response_json:
+            parts.append(f"Response: {self.response_json}")
+        return " | ".join(parts)
+
+
+class HudAuthenticationError(HudException):
+    """Raised when authentication with the HUD API fails.
+
+    This exception is raised when an API key is missing, invalid, or
+    has insufficient permissions for the requested operation.
+    """
+
+
+class HudRateLimitError(HudException):
+    """Raised when the rate limit for the HUD API is exceeded.
+
+    This exception is raised when too many requests are made in a
+    short period of time.
+    """
+
+
+class HudTimeoutError(HudException):
+    """Raised when a request to the HUD API times out.
+
+    This exception is raised when a request takes longer than the
+    configured timeout period.
+    """
+
+
+class HudNetworkError(HudException):
+    """Raised when there is a network-related error.
+
+    This exception is raised when there are issues with the network
+    connection, DNS resolution, or other network-related problems.
+    """
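
The new exceptions module gives the SDK a shared error hierarchy: HudException as the catch-all base, HudRequestError with an httpx-aware from_httpx_error() constructor, and more specific response, authentication, rate-limit, timeout, and network errors. The sketch below shows how a caller might consume these classes; the load_remote_json helper and its URL are hypothetical and not part of hud-python.

# Hypothetical usage sketch for hud/exceptions.py; load_remote_json and the URL
# below are illustrative only and are not part of the hud-python package.
import httpx

from hud.exceptions import HudException, HudRequestError


def load_remote_json(url: str) -> dict:
    try:
        response = httpx.get(url)
        response.raise_for_status()
        return response.json()
    except httpx.HTTPStatusError as error:
        # Wrap the low-level error with context; from_httpx_error also logs and
        # captures status, text, JSON and headers for the exception's __str__.
        raise HudRequestError.from_httpx_error(error, context="loading remote JSON") from error


try:
    data = load_remote_json("https://example.com/data.json")
except HudRequestError as err:
    print(f"request failed: {err}")
except HudException as err:
    print(f"HUD SDK error: {err}")

Note that in this release HudRequestError derives from Exception rather than HudException, so callers that want to cover the full hierarchy need to catch both.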

hud/gym.py CHANGED

@@ -8,15 +8,16 @@ from hud.env.environment import Environment
 from hud.env.local_docker_client import LocalDockerClient
 from hud.env.remote_client import RemoteClient
 from hud.env.remote_docker_client import RemoteDockerClient
-from hud.task import Task
 from hud.types import CustomGym, Gym
 from hud.utils.common import get_gym_id
 
 if TYPE_CHECKING:
     from hud.job import Job
+    from hud.task import Task
 
 logger = logging.getLogger("hud.gym")
 
+
 async def make(
     env_src: Gym | Task,
     *,
@@ -26,7 +27,7 @@ async def make(
 ) -> Environment:
     """
     Create an environment from an environment ID or a Task object.
-
+
     Args:
         env_src: Environment ID or Task object
         job: Job object to associate with this environment
@@ -35,7 +36,7 @@ async def make(
     """
     if metadata is None:
        metadata = {}
-
+
    # Handle job parameter
    effective_job_id = None
    if job is not None:
@@ -45,18 +46,19 @@ async def make(
    else:
        # Try to get an active job from the decorator context
        try:
-
-
+            import hud.job
+
+            active_job = hud.job.get_active_job()
            if active_job:
                effective_job_id = active_job.id
        except ImportError:
            pass  # Module not available, skip
-
+
    gym = None
    task = None
-    if isinstance(env_src,
+    if isinstance(env_src, str | CustomGym):
        gym = env_src
-
+    else:
        gym = env_src.gym
        task = env_src
 
@@ -77,7 +79,7 @@ async def make(
        )
    else:
        raise ValueError(f"Invalid environment location: {gym.location}")
-
+
    # Set up the environment with a source path
    if gym.controller_source_dir:
        logger.info("Setting source path")
@@ -101,7 +103,7 @@ async def make(
 
    # Create the environment itself
    environment = Environment(client=client, metadata=metadata, task=task, build_data=build_data)
-
+
    if task:
        await environment._setup()
 
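
The gym.py hunks are mostly import hygiene and typing: Task moves under TYPE_CHECKING, hud.job is imported lazily inside make() so an active job from the decorator context can be picked up without a hard dependency, and the env_src dispatch becomes isinstance(env_src, str | CustomGym). A rough calling sketch under those assumptions; the "hud-browser" gym id and the metadata values are placeholders, not identifiers defined by this diff.

# Hypothetical sketch of calling hud.gym.make() after the 0.2.3 changes; the gym id
# and metadata below are placeholders, substitute a real Gym, CustomGym, or Task.
import asyncio

import hud.gym


async def main() -> None:
    # env_src may be a gym id string, a CustomGym, or a Task; passing a Task also
    # triggers the environment's _setup() step, per the final hunk above.
    env = await hud.gym.make("hud-browser", metadata={"experiment": "demo"})
    print(env)


asyncio.run(main())

When no job argument is given, make() now consults hud.job.get_active_job() and silently continues if that module is unavailable.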