hud-python 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of hud-python may be problematic.
- hud/__init__.py +5 -3
- hud/adapters/__init__.py +2 -1
- hud/adapters/claude/adapter.py +13 -17
- hud/adapters/common/adapter.py +3 -3
- hud/adapters/common/tests/__init__.py +0 -0
- hud/adapters/common/tests/test_adapter.py +277 -0
- hud/adapters/common/types.py +3 -6
- hud/adapters/operator/adapter.py +22 -29
- hud/agent/__init__.py +9 -1
- hud/agent/base.py +28 -28
- hud/agent/claude.py +69 -60
- hud/agent/langchain.py +204 -0
- hud/agent/operator.py +75 -67
- hud/env/__init__.py +5 -5
- hud/env/client.py +2 -2
- hud/env/docker_client.py +37 -39
- hud/env/environment.py +91 -66
- hud/env/local_docker_client.py +5 -7
- hud/env/remote_client.py +40 -29
- hud/env/remote_docker_client.py +13 -3
- hud/evaluators/__init__.py +2 -3
- hud/evaluators/base.py +4 -3
- hud/evaluators/inspect.py +3 -8
- hud/evaluators/judge.py +34 -58
- hud/evaluators/match.py +42 -49
- hud/evaluators/remote.py +13 -26
- hud/evaluators/tests/__init__.py +0 -0
- hud/evaluators/tests/test_inspect.py +12 -0
- hud/evaluators/tests/test_judge.py +231 -0
- hud/evaluators/tests/test_match.py +115 -0
- hud/evaluators/tests/test_remote.py +98 -0
- hud/exceptions.py +167 -0
- hud/gym.py +12 -10
- hud/job.py +525 -47
- hud/server/__init__.py +2 -2
- hud/server/requests.py +148 -186
- hud/server/tests/__init__.py +0 -0
- hud/server/tests/test_requests.py +275 -0
- hud/settings.py +3 -2
- hud/task.py +12 -22
- hud/taskset.py +44 -11
- hud/trajectory.py +6 -9
- hud/types.py +14 -9
- hud/utils/__init__.py +2 -2
- hud/utils/common.py +37 -13
- hud/utils/config.py +44 -29
- hud/utils/progress.py +149 -0
- hud/utils/telemetry.py +10 -11
- hud/utils/tests/__init__.py +0 -0
- hud/utils/tests/test_common.py +52 -0
- hud/utils/tests/test_config.py +129 -0
- hud/utils/tests/test_progress.py +225 -0
- hud/utils/tests/test_telemetry.py +37 -0
- hud/utils/tests/test_version.py +8 -0
- {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/METADATA +44 -21
- hud_python-0.2.3.dist-info/RECORD +62 -0
- hud_python-0.2.1.dist-info/RECORD +0 -44
- {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/WHEEL +0 -0
- {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/licenses/LICENSE +0 -0
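
For reference, a quick way to confirm which version is installed after upgrading; this is a minimal sketch that assumes the distribution name on the index is "hud-python", as in the wheel filenames above:

# Minimal installed-version check; "hud-python" is the distribution name taken from the wheel filenames above.
from importlib.metadata import version

print(version("hud-python"))  # expected to print "0.2.3" after the upgrade
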
hud/evaluators/judge.py
CHANGED

@@ -11,33 +11,26 @@ from hud.settings import settings
 
 class LLM(Protocol):
     """Protocol for LLM interfaces that can be used for evaluation."""
-
+
+    async def ainvoke(self, prompt: str, /) -> str: ...
 
 
 class Criterion(TypedDict, total=False):
     """Criterion for judge-based evaluation."""
-
+
     description: str
     weight: float
 
 
 async def _call_eval_endpoint(
-    response: Any,
-    answer: Any,
-    criteria: list[Any],
-    mode: str
+    response: Any, answer: Any, criteria: list[Any], mode: str
 ) -> dict[str, Any]:
     """Call the run_eval endpoint to evaluate the response."""
     try:
         result = await make_request(
             method="POST",
             url=f"{settings.base_url}/evaluations/run_eval",
-            json={
-                "response": response,
-                "answer": answer,
-                "criteria": criteria,
-                "mode": mode
-            },
+            json={"response": response, "answer": answer, "criteria": criteria, "mode": mode},
             api_key=settings.api_key,
         )
         return result
@@ -46,31 +39,24 @@ async def _call_eval_endpoint(
         return {
             "score": -1.0,
             "reason": f"Remote evaluation failed: {e!s}. Fallback to default score.",
-            "criteria_scores": {}
+            "criteria_scores": {},
         }
 
 
-def _determine_mode(answer: Any) -> str:
-    """Determine the evaluation mode based on answer type."""
-    if isinstance(answer, bytes) or _is_base64_image(answer):
-        return "VLM"
-    return "LLM"
-
-
 def _process_input(data: Any) -> Any:
     """Process input data, detecting and handling base64 images."""
     if isinstance(data, bytes):
         # Convert bytes to base64 string
         return base64.b64encode(data).decode("utf-8")
-
+
     if isinstance(data, str) and _is_base64_image(data):
         # It's already a base64 string, just return it
         return data
-
+
     if isinstance(data, list) and all(isinstance(item, str) for item in data):
         # Process list of strings
         return data
-
+
     # For other types, convert to string
     return str(data) if not isinstance(data, str | dict) else data
 
@@ -79,11 +65,11 @@ def _is_base64_image(data: Any) -> bool:
     """Check if a string is a base64 encoded image."""
     if not isinstance(data, str):
         return False
-
+
     # Check for common image data URI pattern
     if data.startswith(("data:image/", "data:application/octet-stream")):
         return True
-
+
     # Check if it's a base64 encoded string with image header
     try:
         # First, validate it's base64 decodable
@@ -95,9 +81,7 @@ def _is_base64_image(data: Any) -> bool:
         sample = base64.b64decode(data[:30])
 
         # Check for common image format signatures
-        return (
-            sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
-        )
+        return sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
     except Exception:
         return False
 
@@ -109,50 +93,46 @@ def judge(
     criteria: list[str] | list[dict] | None = None,
 ) -> EvaluationResult:
     """Judge a response against an answer using an LLM.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
         llm: Optional langchain LLM to use for evaluation
         criteria: Evaluation criteria as strings or dictionaries
-
+
     Returns:
         EvaluationResult with evaluation results
     """
     # Process inputs
    processed_response = _process_input(response)
    processed_answer = _process_input(answer)
-
+
     # If LLM is provided, use it for evaluation
     if llm:
         return _evaluate_with_llm(processed_response, processed_answer, llm, criteria)
-
+
     # Otherwise, use the remote evaluation service
     mode = "LLM"
     if isinstance(answer, bytes) or _is_base64_image(answer):
         mode = "VLM"
-
+
     # Call the eval endpoint synchronously
-    result = asyncio.run(
-
-
-
-
-
-
+    result = asyncio.run(
+        _call_eval_endpoint(
+            response=processed_response, answer=processed_answer, criteria=criteria or [], mode=mode
+        )
+    )
+
     return EvaluationResult(
         score=result.get("score", -1.0),
         reason=result.get("reason", "Response evaluated"),
         mode=mode,
-        criteria_scores=result.get("criteria_scores", {})
+        criteria_scores=result.get("criteria_scores", {}),
     )
 
 
 def _evaluate_with_llm(
-    response: Any,
-    answer: Any,
-    llm: LLM,
-    criteria: list[str] | list[dict] | None = None
+    response: Any, answer: Any, llm: LLM, criteria: list[str] | list[dict] | None = None
 ) -> EvaluationResult:
     """Evaluate a response against an answer using a provided LLM."""
     criteria_text = ""
@@ -163,7 +143,7 @@ def _evaluate_with_llm(
             criteria_text += f"- {c['description']}\n"
         elif isinstance(c, str):
             criteria_text += f"- {c}\n"
-
+
     prompt = f"""Evaluate the quality of a response given a reference answer.
 
 REFERENCE ANSWER:
@@ -181,33 +161,29 @@ Format your answer as a JSON object with 'score' (float) and 'reason' (string) f
     try:
         # Run the evaluation asynchronously
         result_text = asyncio.run(llm.ainvoke(prompt))
-
+
         # Attempt to parse JSON response
         import json
         import re
-
+
         # Try to extract JSON if wrapped in other text
         json_match = re.search(r"\{.*?\}", result_text, re.DOTALL)
         if json_match:
             json_str = json_match.group(0)
             result = json.loads(json_str)
-
+
             return EvaluationResult(
                 score=float(result.get("score", 0.5)),
                 reason=result.get("reason", "Evaluated with custom LLM"),
-                mode="custom_llm"
+                mode="custom_llm",
             )
-
+
         # If can't parse as JSON, use default values
         return EvaluationResult(
             score=0.5,
             reason=f"Unable to parse LLM response as JSON. Raw response: {result_text[:100]}...",
-            mode="custom_llm"
+            mode="custom_llm",
         )
-
+
     except Exception as e:
-        return EvaluationResult(
-            score=0.0,
-            reason=f"LLM evaluation error: {e!s}",
-            mode="custom_llm"
-        )
+        return EvaluationResult(score=0.0, reason=f"LLM evaluation error: {e!s}", mode="custom_llm")
hud/evaluators/match.py
CHANGED

@@ -2,20 +2,27 @@ from __future__ import annotations
 
 import re
 from difflib import SequenceMatcher
-from typing import
+from typing import TYPE_CHECKING, Protocol
 
 from textdistance import levenshtein
 
 from hud.evaluators.base import EvaluationResult
 
+if TYPE_CHECKING:
+    from collections.abc import Sequence
 
-def match_single(response: Any, answer: Any) -> EvaluationResult:
+
+class _Stringable(Protocol):
+    def __str__(self) -> str: ...
+
+
+def match_single(response: _Stringable, answer: _Stringable) -> EvaluationResult:
     """Check if the answer is present within the response.
-
+
     Args:
         response: The response to evaluate
         answer: The expected answer
-
+
     Returns:
         EvaluationResult with score=1.0 if match, 0.0 otherwise
     """
@@ -23,54 +30,50 @@ def match_single(response: Any, answer: Any) -> EvaluationResult:
     return EvaluationResult(
         score=1.0 if passed else 0.0,
         reason="Exact match" if passed else "No exact match found",
-        mode="single"
+        mode="single",
     )
 
 
-def match_all(response:
+def match_all(response: _Stringable, answers: Sequence[_Stringable]) -> EvaluationResult:
     """Count how many expected answers are in the response.
-
+
     Args:
         response: The response to evaluate
         answers: List of expected answers
-
+
     Returns:
         EvaluationResult with score=proportion of matches (0.0-1.0)
     """
     response_str = str(response).lower()
     matches = 0
-
+
     for answer in answers:
         if str(answer).lower() in response_str:
             matches += 1
-
+
     score = matches / len(answers) if answers else 0.0
-
+
     if matches == len(answers):
         reason = f"All {matches} expected items found"
     else:
         reason = f"Only {matches} of {len(answers)} expected items found"
-
-    return EvaluationResult(
-        score=score,
-        reason=reason,
-        mode="all"
-    )
 
+    return EvaluationResult(score=score, reason=reason, mode="all")
 
-def match_fuzzy(response: Any, answer: Any) -> EvaluationResult:
+
+def match_fuzzy(response: _Stringable, answer: _Stringable) -> EvaluationResult:
     """Calculate similarity using Levenshtein distance.
-
+
     Args:
         response: The response to evaluate
         answer: The expected answer
-
+
     Returns:
         EvaluationResult with score=similarity (0.0-1.0)
     """
     s1 = str(response).lower()
     s2 = str(answer).lower()
-
+
     if s1 == s2:
         score = 1.0
     elif len(s1) == 0 or len(s2) == 0:
@@ -80,21 +83,19 @@ def match_fuzzy(response: Any, answer: Any) -> EvaluationResult:
         distance = levenshtein.distance(s1, s2)
         max_len = max(len(s1), len(s2))
         score = 1.0 - (distance / max_len)
-
+
     return EvaluationResult(
-        score=score,
-        reason=f"Fuzzy match with {score:.1%} similarity",
-        mode="fuzz"
+        score=score, reason=f"Fuzzy match with {score:.1%} similarity", mode="fuzz"
     )
 
 
-def match_regex(response: Any, pattern: str) -> EvaluationResult:
+def match_regex(response: _Stringable, pattern: str) -> EvaluationResult:
     """Check if response matches regex pattern.
-
+
     Args:
         response: The response to evaluate
         pattern: Regular expression pattern to match
-
+
     Returns:
         EvaluationResult with score=1.0 if match, 0.0 otherwise
     """
@@ -104,23 +105,19 @@ def match_regex(response: Any, pattern: str) -> EvaluationResult:
         return EvaluationResult(
             score=1.0 if passed else 0.0,
             reason="Regex pattern matched" if passed else "Regex pattern did not match",
-            mode="regex"
+            mode="regex",
         )
     except re.error:
-        return EvaluationResult(
-            score=0.0,
-            reason="Invalid regex pattern",
-            mode="regex"
-        )
+        return EvaluationResult(score=0.0, reason="Invalid regex pattern", mode="regex")
 
 
-def match_diff(response: Any, answer: Any) -> EvaluationResult:
+def match_diff(response: _Stringable, answer: _Stringable) -> EvaluationResult:
     """Compare difference between response and answer.
-
+
     Args:
         response: The response to evaluate
         answer: The expected answer
-
+
     Returns:
         EvaluationResult with score=similarity (0.0-1.0)
     """
@@ -130,34 +127,30 @@ def match_diff(response: Any, answer: Any) -> EvaluationResult:
     else:
         score = _match_string_diff(response, answer)
         reason = f"String difference with {score:.1%} similarity"
-
-    return EvaluationResult(
-        score=score,
-        reason=reason,
-        mode="diff"
-    )
+
+    return EvaluationResult(score=score, reason=reason, mode="diff")
 
 
-def _match_string_diff(response:
+def _match_string_diff(response: _Stringable, answer: _Stringable) -> float:
     """Compare difference between response and answer strings."""
     matcher = SequenceMatcher(None, str(response), str(answer))
     return matcher.ratio()
-
+
 
 def _match_numeric_diff(response: float, answer: float) -> float:
     """Calculate normalized difference between numeric values.
-
+
     Returns a value between 0 and 1, where 1 means identical and 0 means maximum difference.
     """
     if response == answer:
         return 1.0
-
+
     # Simple absolute difference normalized to a 0-1 scale
     diff = abs(response - answer)
     max_val = max(abs(response), abs(answer))
-
+
     if max_val == 0:
         return 1.0  # Both are zero
-
+
     # Normalize and invert so 1.0 means identical
     return max(0.0, 1.0 - min(1.0, diff / max_val))
hud/evaluators/remote.py
CHANGED

@@ -9,19 +9,16 @@ from hud.settings import settings
 
 
 async def _remote_eval_call(
-    response: Any,
-    answer: Any,
-    eval_type: str,
-    config: dict[str, Any] | None = None
+    response: Any, answer: Any, eval_type: str, config: dict[str, Any] | None = None
 ) -> dict[str, Any]:
     """Send an evaluation request to the remote server.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
         eval_type: Type of evaluation (e.g., "match", "judge", "agent")
         config: Optional configuration parameters
-
+
     Returns:
         Dictionary with evaluation results from the server
     """
@@ -33,46 +30,36 @@ async def _remote_eval_call(
                 "response": response,
                 "answer": answer,
                 "type": eval_type,
-                "config": config or {}
+                "config": config or {},
             },
             api_key=settings.api_key,
         )
         return result
     except Exception as e:
-        return {
-            "score": -1.0,
-            "reason": f"Remote evaluation failed: {e!s}",
-            "details": {}
-        }
+        return {"score": -1.0, "reason": f"Remote evaluation failed: {e!s}", "details": {}}
 
 
 def remote_evaluate(
-    response: Any,
-    answer: Any,
-    eval_type: str = "default",
-    config: dict[str, Any] | None = None
+    response: Any, answer: Any, eval_type: str = "default", config: dict[str, Any] | None = None
 ) -> EvaluationResult:
     """Evaluate a response using remote evaluation services.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
         eval_type: Type of evaluation to perform
         config: Optional configuration for the evaluation
-
+
     Returns:
         EvaluationResult containing the evaluation results
     """
-    result = asyncio.run(
-        response=response,
-
-
-        config=config
-    ))
-
+    result = asyncio.run(
+        _remote_eval_call(response=response, answer=answer, eval_type=eval_type, config=config)
+    )
+
     return EvaluationResult(
         score=result.get("score", -1.0),
         reason=result.get("reason", "Remote evaluation completed"),
         mode=eval_type,
-        criteria_scores=result.get("details", {})
+        criteria_scores=result.get("details", {}),
     )

hud/evaluators/tests/__init__.py
File without changes

hud/evaluators/tests/test_inspect.py
ADDED

@@ -0,0 +1,12 @@
+from __future__ import annotations
+
+from hud.evaluators.inspect import inspect_evaluate
+
+
+def test_inspect_evaluate_basic():
+    """Test basic functionality of inspect_evaluate."""
+    result = inspect_evaluate("Test response", "Test answer")
+
+    assert result.score == 0.0
+    assert result.reason == "Inspect evaluation not implemented"
+    assert result.mode == "inspect"