hud-python 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic.
- hud/__init__.py +4 -3
- hud/adapters/claude/adapter.py +5 -14
- hud/adapters/common/adapter.py +3 -3
- hud/adapters/common/tests/__init__.py +0 -0
- hud/adapters/common/tests/test_adapter.py +277 -0
- hud/adapters/common/types.py +3 -3
- hud/adapters/operator/adapter.py +16 -23
- hud/agent/__init__.py +8 -1
- hud/agent/base.py +28 -28
- hud/agent/claude.py +69 -60
- hud/agent/langchain.py +32 -26
- hud/agent/operator.py +75 -67
- hud/env/__init__.py +5 -5
- hud/env/client.py +2 -2
- hud/env/docker_client.py +37 -39
- hud/env/environment.py +91 -66
- hud/env/local_docker_client.py +5 -7
- hud/env/remote_client.py +39 -32
- hud/env/remote_docker_client.py +13 -3
- hud/evaluators/__init__.py +2 -3
- hud/evaluators/base.py +4 -3
- hud/evaluators/inspect.py +3 -8
- hud/evaluators/judge.py +34 -58
- hud/evaluators/match.py +42 -49
- hud/evaluators/remote.py +13 -26
- hud/evaluators/tests/__init__.py +0 -0
- hud/evaluators/tests/test_inspect.py +12 -0
- hud/evaluators/tests/test_judge.py +231 -0
- hud/evaluators/tests/test_match.py +115 -0
- hud/evaluators/tests/test_remote.py +98 -0
- hud/exceptions.py +167 -0
- hud/gym.py +9 -7
- hud/job.py +179 -109
- hud/server/__init__.py +2 -2
- hud/server/requests.py +148 -186
- hud/server/tests/__init__.py +0 -0
- hud/server/tests/test_requests.py +275 -0
- hud/settings.py +3 -2
- hud/task.py +9 -19
- hud/taskset.py +44 -11
- hud/trajectory.py +6 -9
- hud/types.py +12 -9
- hud/utils/__init__.py +2 -2
- hud/utils/common.py +36 -15
- hud/utils/config.py +45 -30
- hud/utils/progress.py +34 -21
- hud/utils/telemetry.py +10 -11
- hud/utils/tests/__init__.py +0 -0
- hud/utils/tests/test_common.py +52 -0
- hud/utils/tests/test_config.py +129 -0
- hud/utils/tests/test_progress.py +225 -0
- hud/utils/tests/test_telemetry.py +37 -0
- hud/utils/tests/test_version.py +8 -0
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/METADATA +9 -6
- hud_python-0.2.4.dist-info/RECORD +62 -0
- hud_python-0.2.2.dist-info/RECORD +0 -46
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/WHEEL +0 -0
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/licenses/LICENSE +0 -0
hud/evaluators/match.py
CHANGED
@@ -2,20 +2,27 @@ from __future__ import annotations
 
 import re
 from difflib import SequenceMatcher
-from typing import
+from typing import TYPE_CHECKING, Protocol
 
 from textdistance import levenshtein
 
 from hud.evaluators.base import EvaluationResult
 
+if TYPE_CHECKING:
+    from collections.abc import Sequence
 
-def match_single(response: Any, answer: Any) -> EvaluationResult:
+
+class _Stringable(Protocol):
+    def __str__(self) -> str: ...
+
+
+def match_single(response: _Stringable, answer: _Stringable) -> EvaluationResult:
     """Check if the answer is present within the response.
-
+
     Args:
         response: The response to evaluate
         answer: The expected answer
-
+
     Returns:
         EvaluationResult with score=1.0 if match, 0.0 otherwise
     """
@@ -23,54 +30,50 @@ def match_single(response: Any, answer: Any) -> EvaluationResult:
     return EvaluationResult(
         score=1.0 if passed else 0.0,
         reason="Exact match" if passed else "No exact match found",
-        mode="single"
+        mode="single",
     )
 
 
-def match_all(response:
+def match_all(response: _Stringable, answers: Sequence[_Stringable]) -> EvaluationResult:
     """Count how many expected answers are in the response.
-
+
     Args:
         response: The response to evaluate
         answers: List of expected answers
-
+
     Returns:
         EvaluationResult with score=proportion of matches (0.0-1.0)
     """
     response_str = str(response).lower()
     matches = 0
-
+
     for answer in answers:
         if str(answer).lower() in response_str:
             matches += 1
-
+
     score = matches / len(answers) if answers else 0.0
-
+
     if matches == len(answers):
         reason = f"All {matches} expected items found"
     else:
         reason = f"Only {matches} of {len(answers)} expected items found"
-
-    return EvaluationResult(
-        score=score,
-        reason=reason,
-        mode="all"
-    )
 
+    return EvaluationResult(score=score, reason=reason, mode="all")
 
-def match_fuzzy(response: Any, answer: Any) -> EvaluationResult:
+
+def match_fuzzy(response: _Stringable, answer: _Stringable) -> EvaluationResult:
     """Calculate similarity using Levenshtein distance.
-
+
     Args:
         response: The response to evaluate
         answer: The expected answer
-
+
     Returns:
         EvaluationResult with score=similarity (0.0-1.0)
     """
     s1 = str(response).lower()
     s2 = str(answer).lower()
-
+
     if s1 == s2:
         score = 1.0
     elif len(s1) == 0 or len(s2) == 0:
@@ -80,21 +83,19 @@ def match_fuzzy(response: Any, answer: Any) -> EvaluationResult:
         distance = levenshtein.distance(s1, s2)
         max_len = max(len(s1), len(s2))
         score = 1.0 - (distance / max_len)
-
+
     return EvaluationResult(
-        score=score,
-        reason=f"Fuzzy match with {score:.1%} similarity",
-        mode="fuzz"
+        score=score, reason=f"Fuzzy match with {score:.1%} similarity", mode="fuzz"
     )
 
 
-def match_regex(response: Any, pattern: str) -> EvaluationResult:
+def match_regex(response: _Stringable, pattern: str) -> EvaluationResult:
     """Check if response matches regex pattern.
-
+
     Args:
         response: The response to evaluate
         pattern: Regular expression pattern to match
-
+
     Returns:
         EvaluationResult with score=1.0 if match, 0.0 otherwise
     """
@@ -104,23 +105,19 @@ def match_regex(response: Any, pattern: str) -> EvaluationResult:
         return EvaluationResult(
             score=1.0 if passed else 0.0,
             reason="Regex pattern matched" if passed else "Regex pattern did not match",
-            mode="regex"
+            mode="regex",
         )
     except re.error:
-        return EvaluationResult(
-            score=0.0,
-            reason="Invalid regex pattern",
-            mode="regex"
-        )
+        return EvaluationResult(score=0.0, reason="Invalid regex pattern", mode="regex")
 
 
-def match_diff(response: Any, answer: Any) -> EvaluationResult:
+def match_diff(response: _Stringable, answer: _Stringable) -> EvaluationResult:
     """Compare difference between response and answer.
-
+
     Args:
         response: The response to evaluate
         answer: The expected answer
-
+
     Returns:
         EvaluationResult with score=similarity (0.0-1.0)
     """
@@ -130,34 +127,30 @@ def match_diff(response: Any, answer: Any) -> EvaluationResult:
     else:
         score = _match_string_diff(response, answer)
         reason = f"String difference with {score:.1%} similarity"
-
-    return EvaluationResult(
-        score=score,
-        reason=reason,
-        mode="diff"
-    )
+
+    return EvaluationResult(score=score, reason=reason, mode="diff")
 
 
-def _match_string_diff(response:
+def _match_string_diff(response: _Stringable, answer: _Stringable) -> float:
    """Compare difference between response and answer strings."""
     matcher = SequenceMatcher(None, str(response), str(answer))
     return matcher.ratio()
-
+
 
 def _match_numeric_diff(response: float, answer: float) -> float:
     """Calculate normalized difference between numeric values.
-
+
     Returns a value between 0 and 1, where 1 means identical and 0 means maximum difference.
     """
     if response == answer:
         return 1.0
-
+
     # Simple absolute difference normalized to a 0-1 scale
     diff = abs(response - answer)
     max_val = max(abs(response), abs(answer))
-
+
     if max_val == 0:
         return 1.0  # Both are zero
-
+
     # Normalize and invert so 1.0 means identical
     return max(0.0, 1.0 - min(1.0, diff / max_val))
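The net effect of this change is stricter typing rather than new behavior: every public matcher now accepts any object that implements __str__ (the new _Stringable protocol) instead of Any, and the EvaluationResult constructions are collapsed onto single lines. A minimal usage sketch, based only on the signatures and docstrings visible in this diff (the values in the comments follow from the arithmetic shown above, not from running the package):

from hud.evaluators.match import match_all, match_fuzzy, match_single

# Exact substring check: 1.0 when the expected answer appears in the response.
print(match_single("The capital of France is Paris.", "Paris").score)  # 1.0

# Proportion of the expected answers found in the response (0.0-1.0).
print(match_all("Paris is in France", ["Paris", "France", "Seine"]).score)  # 2/3

# Levenshtein-based similarity: distance 1 over max length 6 -> 1 - 1/6.
result = match_fuzzy("colour", "color")
print(result.score, result.reason)  # ~0.833 "Fuzzy match with 83.3% similarity"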
hud/evaluators/remote.py
CHANGED
@@ -9,19 +9,16 @@ from hud.settings import settings
 
 
 async def _remote_eval_call(
-    response: Any,
-    answer: Any,
-    eval_type: str,
-    config: dict[str, Any] | None = None
+    response: Any, answer: Any, eval_type: str, config: dict[str, Any] | None = None
 ) -> dict[str, Any]:
     """Send an evaluation request to the remote server.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
         eval_type: Type of evaluation (e.g., "match", "judge", "agent")
         config: Optional configuration parameters
-
+
     Returns:
         Dictionary with evaluation results from the server
     """
@@ -33,46 +30,36 @@ async def _remote_eval_call(
                 "response": response,
                 "answer": answer,
                 "type": eval_type,
-                "config": config or {}
+                "config": config or {},
             },
             api_key=settings.api_key,
         )
         return result
     except Exception as e:
-        return {
-            "score": -1.0,
-            "reason": f"Remote evaluation failed: {e!s}",
-            "details": {}
-        }
+        return {"score": -1.0, "reason": f"Remote evaluation failed: {e!s}", "details": {}}
 
 
 def remote_evaluate(
-    response: Any,
-    answer: Any,
-    eval_type: str = "default",
-    config: dict[str, Any] | None = None
+    response: Any, answer: Any, eval_type: str = "default", config: dict[str, Any] | None = None
 ) -> EvaluationResult:
     """Evaluate a response using remote evaluation services.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
         eval_type: Type of evaluation to perform
         config: Optional configuration for the evaluation
-
+
     Returns:
         EvaluationResult containing the evaluation results
     """
-    result = asyncio.run(
-        response=response,
-
-
-        config=config
-    ))
-
+    result = asyncio.run(
+        _remote_eval_call(response=response, answer=answer, eval_type=eval_type, config=config)
+    )
+
     return EvaluationResult(
         score=result.get("score", -1.0),
         reason=result.get("reason", "Remote evaluation completed"),
         mode=eval_type,
-        criteria_scores=result.get("details", {})
+        criteria_scores=result.get("details", {}),
     )
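remote_evaluate keeps its synchronous facade (it wraps the async _remote_eval_call in asyncio.run) and simply reflows the argument lists. A usage sketch under the assumption that settings.api_key is already configured; the eval_type values come from the docstring above, and the config payload here is purely hypothetical:

from hud.evaluators.remote import remote_evaluate

# Must be called from synchronous code: asyncio.run() cannot run inside an existing event loop.
result = remote_evaluate(
    response="The capital of France is Paris.",
    answer="Paris",
    eval_type="match",        # e.g. "match", "judge", "agent"
    config={"strict": True},  # hypothetical options, forwarded to the server as-is
)

# A score of -1.0 indicates the remote call failed ("Remote evaluation failed: ...").
print(result.score, result.reason)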
hud/evaluators/tests/__init__.py
ADDED
File without changes

hud/evaluators/tests/test_inspect.py
ADDED
@@ -0,0 +1,12 @@
+from __future__ import annotations
+
+from hud.evaluators.inspect import inspect_evaluate
+
+
+def test_inspect_evaluate_basic():
+    """Test basic functionality of inspect_evaluate."""
+    result = inspect_evaluate("Test response", "Test answer")
+
+    assert result.score == 0.0
+    assert result.reason == "Inspect evaluation not implemented"
+    assert result.mode == "inspect"
hud/evaluators/tests/test_judge.py
ADDED
@@ -0,0 +1,231 @@
+from __future__ import annotations
+
+import base64
+
+import pytest
+
+from hud.evaluators.base import EvaluationResult
+from hud.evaluators.judge import (
+    _call_eval_endpoint,
+    _evaluate_with_llm,
+    _is_base64_image,
+    _process_input,
+    judge,
+)
+
+
+class _MockLLM:
+    """Mock LLM for testing."""
+
+    def __init__(self, response_text):
+        self.response_text = response_text
+
+    async def ainvoke(self, _prompt: str) -> str:
+        return self.response_text
+
+
+@pytest.mark.parametrize(
+    "input_data, expected_result",
+    [
+        ("Hello world", "Hello world"),
+        (123, "123"),
+        (["Hello", "world"], ["Hello", "world"]),
+        ({"key": "value"}, {"key": "value"}),
+        (b"Hello world", base64.b64encode(b"Hello world").decode("utf-8")),
+    ],
+)
+def test_process_input(input_data, expected_result):
+    """Test processing various input types."""
+    result = _process_input(input_data)
+    assert result == expected_result
+
+
+@pytest.mark.parametrize(
+    "input_data, expected_result",
+    [
+        ("not an image", False),
+        ("data:image/png;base64,abc123", True),
+        (b"not an image", False),
+        (123, False),
+    ],
+)
+def test_is_base64_image(input_data, expected_result):
+    """Test base64 image detection."""
+    assert _is_base64_image(input_data) == expected_result
+
+
+def test_is_base64_image_with_signatures(mocker):
+    """Test base64 image detection with common image signatures."""
+    # Mock base64.b64decode to return different image signatures
+    mock_b64decode = mocker.patch("base64.b64decode")
+
+    # Test JPEG signature
+    mock_b64decode.return_value = b"\xff\xd8\xff" + b"some data"
+    assert _is_base64_image("not_really_base64_but_mocked") is True
+
+    # Test PNG signature
+    mock_b64decode.return_value = b"\x89PNG\r\n\x1a\n" + b"some data"
+    assert _is_base64_image("not_really_base64_but_mocked") is True
+
+    # Test GIF signature
+    mock_b64decode.return_value = b"GIF8" + b"some data"
+    assert _is_base64_image("not_really_base64_but_mocked") is True
+
+    # Test RIFF signature (WebP)
+    mock_b64decode.return_value = b"RIFF" + b"some data"
+    assert _is_base64_image("not_really_base64_but_mocked") is True
+
+
+@pytest.mark.asyncio
+async def test_call_eval_endpoint_success(mocker):
+    """Test successful remote evaluation call."""
+    mock_response = {
+        "score": 0.8,
+        "reason": "Good response",
+        "criteria_scores": {"relevance": 0.9, "accuracy": 0.7},
+    }
+    mock_make_request = mocker.patch(
+        "hud.evaluators.judge.make_request", return_value=mock_response
+    )
+    result = await _call_eval_endpoint("test response", "test answer", [], "LLM")
+    assert result == mock_response
+    mock_make_request.assert_called_once()
+
+
+@pytest.mark.asyncio
+async def test_call_eval_endpoint_failure(mocker):
+    """Test remote evaluation call failure."""
+    mocker.patch("hud.evaluators.judge.make_request", side_effect=Exception("API error"))
+    result = await _call_eval_endpoint("test response", "test answer", [], "LLM")
+    assert result["score"] == -1.0
+    assert "Remote evaluation failed" in result["reason"]
+    assert result["criteria_scores"] == {}
+
+
+def test_judge_without_llm(mocker):
+    """Test judge function without custom LLM."""
+    mock_result = {
+        "score": 0.9,
+        "reason": "Good answer",
+        "criteria_scores": {"relevance": 1.0},
+    }
+
+    async def mock_endpoint(*args, **kwargs):
+        return mock_result
+
+    mocker.patch("hud.evaluators.judge._call_eval_endpoint", mock_endpoint)
+    result = judge("test response", "test answer")
+
+    assert result.score == 0.9
+    assert result.reason == "Good answer"
+    assert result.mode == "LLM"
+    assert result.criteria_scores == {"relevance": 1.0}
+
+
+def test_judge_with_image_answer(mocker):
+    """Test judge function with an image as the answer."""
+    mock_result = {
+        "score": 0.85,
+        "reason": "Good image analysis",
+        "criteria_scores": {"visual_accuracy": 0.85},
+    }
+
+    async def mock_endpoint(*args, **kwargs):
+        return mock_result
+
+    mocker.patch("hud.evaluators.judge._call_eval_endpoint", mock_endpoint)
+
+    # Create a mock image
+    image_data = b"fake_image_data"
+    base64_image = base64.b64encode(image_data).decode("utf-8")
+    image_uri = f"data:image/jpeg;base64,{base64_image}"
+
+    result = judge("description of image", image_uri)
+
+    assert result.score == 0.85
+    assert result.reason == "Good image analysis"
+    assert result.mode == "VLM"  # Should use VLM mode for images
+    assert result.criteria_scores == {"visual_accuracy": 0.85}
+
+
+def test_judge_with_llm(mocker):
+    """Test judge function with custom LLM."""
+    mock_llm = _MockLLM('{"score": 0.75, "reason": "Pretty good"}')
+    mock_result = EvaluationResult(score=0.75, reason="Pretty good", mode="custom_llm")
+    mocker.patch("hud.evaluators.judge._evaluate_with_llm", return_value=mock_result)
+    result = judge("test response", "test answer", llm=mock_llm)
+    assert result.score == 0.75
+    assert result.reason == "Pretty good"
+    assert result.mode == "custom_llm"
+
+
+def test_evaluate_with_llm_valid_json():
+    """Test _evaluate_with_llm with valid JSON response."""
+    llm = _MockLLM('{"score": 0.85, "reason": "The response is accurate and well-structured."}')
+    result = _evaluate_with_llm("test response", "test answer", llm)
+
+    assert result.score == 0.85
+    assert result.reason == "The response is accurate and well-structured."
+    assert result.mode == "custom_llm"
+
+
+def test_evaluate_with_llm_json_in_text():
+    """Test _evaluate_with_llm with JSON embedded in text."""
+    llm_response = """
+    I've evaluated the response and here's my assessment:
+
+    {"score": 0.7, "reason": "Good but could be more detailed"}
+
+    I hope this helps!
+    """
+    llm = _MockLLM(llm_response)
+    result = _evaluate_with_llm("test response", "test answer", llm)
+
+    assert result.score == 0.7
+    assert result.reason == "Good but could be more detailed"
+    assert result.mode == "custom_llm"
+
+
+def test_evaluate_with_llm_invalid_json():
+    """Test _evaluate_with_llm with invalid JSON response."""
+    llm = _MockLLM("This is not a JSON response")
+    result = _evaluate_with_llm("test response", "test answer", llm)
+
+    assert result.score == 0.5  # Default score for unparseable responses
+    assert "Unable to parse LLM response as JSON" in result.reason
+    assert result.mode == "custom_llm"
+
+
+def test_evaluate_with_llm_exception(mocker):
+    """Test _evaluate_with_llm when an exception occurs."""
+    # Mock the LLM to raise an exception
+    failing_llm = _MockLLM("doesn't matter")
+    mocker.patch.object(failing_llm, "ainvoke", side_effect=Exception("LLM API error"))
+
+    result = _evaluate_with_llm("test response", "test answer", failing_llm)
+
+    assert result.score == 0.0  # Zero score for errors
+    assert "LLM evaluation error: LLM API error" in result.reason
+    assert result.mode == "custom_llm"
+
+
+def test_evaluate_with_llm_with_criteria():
+    """Test _evaluate_with_llm with evaluation criteria."""
+    llm = _MockLLM('{"score": 0.9, "reason": "Excellent match on all criteria"}')
+
+    # Test with string criteria
+    string_criteria = ["Accuracy", "Relevance", "Completeness"]
+    result = _evaluate_with_llm("test response", "test answer", llm, criteria=string_criteria)
+
+    assert result.score == 0.9
+    assert result.reason == "Excellent match on all criteria"
+
+    # Test with dict criteria
+    dict_criteria = [
+        {"description": "Factual accuracy", "weight": 0.6},
+        {"description": "Grammar and spelling", "weight": 0.4},
+    ]
+    result = _evaluate_with_llm("test response", "test answer", llm, criteria=dict_criteria)
+
+    assert result.score == 0.9
+    assert result.reason == "Excellent match on all criteria"
hud/evaluators/tests/test_match.py
ADDED
@@ -0,0 +1,115 @@
+from __future__ import annotations
+
+import pytest
+
+from hud.evaluators.match import match_all, match_diff, match_fuzzy, match_regex, match_single
+
+
+@pytest.mark.parametrize(
+    "response, answer, expected_score, expected_reason, expected_mode",
+    [
+        ("Hello, world!", "world", 1.0, "Exact match", "single"),
+        ("Hello, world!", "not world", 0.0, "No exact match found", "single"),
+    ],
+)
+def test_match_single(
+    response: str,
+    answer: str,
+    expected_score: float,
+    expected_reason: str,
+    expected_mode: str,
+):
+    result = match_single(response, answer)
+    assert result.score == expected_score
+    assert result.reason == expected_reason
+    assert result.mode == expected_mode
+
+
+@pytest.mark.parametrize(
+    "response, answers, expected_score, expected_reason, expected_mode",
+    [
+        ("Hello, world!", ["world", "hello"], 1.0, "All 2 expected items found", "all"),
+        ("Hello, world!", ["world", "not hello"], 0.5, "Only 1 of 2 expected items found", "all"),
+    ],
+)
+def test_match_all(
+    response: str,
+    answers: list[str],
+    expected_score: float,
+    expected_reason: str,
+    expected_mode: str,
+):
+    result = match_all(response, answers)
+    assert result.score == expected_score
+    assert result.reason == expected_reason
+    assert result.mode == expected_mode
+
+
+@pytest.mark.parametrize(
+    "response, answer, expected_score, expected_reason, expected_mode",
+    [
+        ("hello world", "hello world", 1.0, "Fuzzy match with 100.0% similarity", "fuzz"),
+        ("hello wrld", "hello world", 0.9, "Fuzzy match with 90.9% similarity", "fuzz"),
+        ("hello", "hello world", 0.45, "Fuzzy match with 45.5% similarity", "fuzz"),
+        ("", "hello world", 0.0, "Fuzzy match with 0.0% similarity", "fuzz"),
+    ],
+)
+def test_match_fuzzy(
+    response: str,
+    answer: str,
+    expected_score: float,
+    expected_reason: str,
+    expected_mode: str,
+):
+    result = match_fuzzy(response, answer)
+    assert result.score == pytest.approx(expected_score, abs=1e-2)
+    assert result.reason == expected_reason
+    assert result.mode == expected_mode
+
+
+@pytest.mark.parametrize(
+    "response, pattern, expected_score, expected_reason, expected_mode",
+    [
+        ("hello world", r"hello.*", 1.0, "Regex pattern matched", "regex"),
+        ("hello world", r"^hello.*$", 1.0, "Regex pattern matched", "regex"),
+        ("hello world", r"world$", 1.0, "Regex pattern matched", "regex"),
+        ("hello world", r"^goodbye.*$", 0.0, "Regex pattern did not match", "regex"),
+        ("hello world", r"[invalid[", 0.0, "Invalid regex pattern", "regex"),
+    ],
+)
+def test_match_regex(
+    response: str,
+    pattern: str,
+    expected_score: float,
+    expected_reason: str,
+    expected_mode: str,
+):
+    result = match_regex(response, pattern)
+    assert result.score == expected_score
+    assert result.reason == expected_reason
+    assert result.mode == expected_mode
+
+
+@pytest.mark.parametrize(
+    "response, answer, expected_score, expected_reason, expected_mode",
+    [
+        ("hello world", "hello world", 1.0, "String difference with 100.0% similarity", "diff"),
+        ("hello", "hello world", 0.625, "String difference with 62.5% similarity", "diff"),
+        ("", "hello world", 0.0, "String difference with 0.0% similarity", "diff"),
+        (100, 100, 1.0, "Numeric difference: 0", "diff"),
+        (90, 100, 0.9, "Numeric difference: 10", "diff"),
+        (0, 100, 0.0, "Numeric difference: 100", "diff"),
+        (-100, 100, 0.0, "Numeric difference: 200", "diff"),
+    ],
+)
+def test_match_diff(
+    response: str | int | float,
+    answer: str | int | float,
+    expected_score: float,
+    expected_reason: str,
+    expected_mode: str,
+):
+    result = match_diff(response, answer)
+    assert result.score == pytest.approx(expected_score, abs=1e-2)
+    assert result.reason == expected_reason
+    assert result.mode == expected_mode
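The new suites lean on the mocker fixture and @pytest.mark.asyncio, which points to pytest-mock and pytest-asyncio as test-time dependencies (an inference from the fixtures used above, not from the wheel's declared metadata). A minimal sketch for running just the evaluator tests from a source checkout via pytest's programmatic entry point:

import sys

import pytest

# Runs only the evaluator test package added in 0.2.4; -q keeps the output terse.
sys.exit(pytest.main(["-q", "hud/evaluators/tests"]))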