hud-python 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in their respective public registries.

Potentially problematic release: this version of hud-python has been flagged as possibly problematic.

Files changed (59)
  1. hud/__init__.py +5 -3
  2. hud/adapters/__init__.py +2 -1
  3. hud/adapters/claude/adapter.py +13 -17
  4. hud/adapters/common/adapter.py +3 -3
  5. hud/adapters/common/tests/__init__.py +0 -0
  6. hud/adapters/common/tests/test_adapter.py +277 -0
  7. hud/adapters/common/types.py +3 -6
  8. hud/adapters/operator/adapter.py +22 -29
  9. hud/agent/__init__.py +9 -1
  10. hud/agent/base.py +28 -28
  11. hud/agent/claude.py +69 -60
  12. hud/agent/langchain.py +204 -0
  13. hud/agent/operator.py +75 -67
  14. hud/env/__init__.py +5 -5
  15. hud/env/client.py +2 -2
  16. hud/env/docker_client.py +37 -39
  17. hud/env/environment.py +91 -66
  18. hud/env/local_docker_client.py +5 -7
  19. hud/env/remote_client.py +40 -29
  20. hud/env/remote_docker_client.py +13 -3
  21. hud/evaluators/__init__.py +2 -3
  22. hud/evaluators/base.py +4 -3
  23. hud/evaluators/inspect.py +3 -8
  24. hud/evaluators/judge.py +34 -58
  25. hud/evaluators/match.py +42 -49
  26. hud/evaluators/remote.py +13 -26
  27. hud/evaluators/tests/__init__.py +0 -0
  28. hud/evaluators/tests/test_inspect.py +12 -0
  29. hud/evaluators/tests/test_judge.py +231 -0
  30. hud/evaluators/tests/test_match.py +115 -0
  31. hud/evaluators/tests/test_remote.py +98 -0
  32. hud/exceptions.py +167 -0
  33. hud/gym.py +12 -10
  34. hud/job.py +525 -47
  35. hud/server/__init__.py +2 -2
  36. hud/server/requests.py +148 -186
  37. hud/server/tests/__init__.py +0 -0
  38. hud/server/tests/test_requests.py +275 -0
  39. hud/settings.py +3 -2
  40. hud/task.py +12 -22
  41. hud/taskset.py +44 -11
  42. hud/trajectory.py +6 -9
  43. hud/types.py +14 -9
  44. hud/utils/__init__.py +2 -2
  45. hud/utils/common.py +37 -13
  46. hud/utils/config.py +44 -29
  47. hud/utils/progress.py +149 -0
  48. hud/utils/telemetry.py +10 -11
  49. hud/utils/tests/__init__.py +0 -0
  50. hud/utils/tests/test_common.py +52 -0
  51. hud/utils/tests/test_config.py +129 -0
  52. hud/utils/tests/test_progress.py +225 -0
  53. hud/utils/tests/test_telemetry.py +37 -0
  54. hud/utils/tests/test_version.py +8 -0
  55. {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/METADATA +44 -21
  56. hud_python-0.2.3.dist-info/RECORD +62 -0
  57. hud_python-0.2.1.dist-info/RECORD +0 -44
  58. {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/WHEEL +0 -0
  59. {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/licenses/LICENSE +0 -0
hud/evaluators/judge.py CHANGED
@@ -11,33 +11,26 @@ from hud.settings import settings
 
 class LLM(Protocol):
     """Protocol for LLM interfaces that can be used for evaluation."""
-    async def ainvoke(self, prompt: str) -> str: ...
+
+    async def ainvoke(self, prompt: str, /) -> str: ...
 
 
 class Criterion(TypedDict, total=False):
     """Criterion for judge-based evaluation."""
-
+
     description: str
     weight: float
 
 
 async def _call_eval_endpoint(
-    response: Any,
-    answer: Any,
-    criteria: list[Any],
-    mode: str
+    response: Any, answer: Any, criteria: list[Any], mode: str
 ) -> dict[str, Any]:
     """Call the run_eval endpoint to evaluate the response."""
     try:
         result = await make_request(
             method="POST",
             url=f"{settings.base_url}/evaluations/run_eval",
-            json={
-                "response": response,
-                "answer": answer,
-                "criteria": criteria,
-                "mode": mode
-            },
+            json={"response": response, "answer": answer, "criteria": criteria, "mode": mode},
             api_key=settings.api_key,
         )
         return result
@@ -46,31 +39,24 @@ async def _call_eval_endpoint(
         return {
             "score": -1.0,
             "reason": f"Remote evaluation failed: {e!s}. Fallback to default score.",
-            "criteria_scores": {}
+            "criteria_scores": {},
         }
 
 
-def _determine_mode(answer: Any) -> str:
-    """Determine the evaluation mode based on answer type."""
-    if isinstance(answer, bytes) or _is_base64_image(answer):
-        return "VLM"
-    return "LLM"
-
-
 def _process_input(data: Any) -> Any:
     """Process input data, detecting and handling base64 images."""
     if isinstance(data, bytes):
         # Convert bytes to base64 string
         return base64.b64encode(data).decode("utf-8")
-
+
     if isinstance(data, str) and _is_base64_image(data):
         # It's already a base64 string, just return it
         return data
-
+
     if isinstance(data, list) and all(isinstance(item, str) for item in data):
         # Process list of strings
         return data
-
+
     # For other types, convert to string
     return str(data) if not isinstance(data, str | dict) else data
 
@@ -79,11 +65,11 @@ def _is_base64_image(data: Any) -> bool:
     """Check if a string is a base64 encoded image."""
     if not isinstance(data, str):
         return False
-
+
     # Check for common image data URI pattern
     if data.startswith(("data:image/", "data:application/octet-stream")):
         return True
-
+
     # Check if it's a base64 encoded string with image header
     try:
         # First, validate it's base64 decodable
@@ -95,9 +81,7 @@ def _is_base64_image(data: Any) -> bool:
         sample = base64.b64decode(data[:30])
 
         # Check for common image format signatures
-        return (
-            sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
-        )
+        return sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
     except Exception:
         return False
 
@@ -109,50 +93,46 @@ def judge(
     criteria: list[str] | list[dict] | None = None,
 ) -> EvaluationResult:
     """Judge a response against an answer using an LLM.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
         llm: Optional langchain LLM to use for evaluation
         criteria: Evaluation criteria as strings or dictionaries
-
+
     Returns:
         EvaluationResult with evaluation results
     """
     # Process inputs
     processed_response = _process_input(response)
     processed_answer = _process_input(answer)
-
+
     # If LLM is provided, use it for evaluation
     if llm:
         return _evaluate_with_llm(processed_response, processed_answer, llm, criteria)
-
+
     # Otherwise, use the remote evaluation service
     mode = "LLM"
    if isinstance(answer, bytes) or _is_base64_image(answer):
        mode = "VLM"
-
+
     # Call the eval endpoint synchronously
-    result = asyncio.run(_call_eval_endpoint(
-        response=processed_response,
-        answer=processed_answer,
-        criteria=criteria or [],
-        mode=mode
-    ))
-
+    result = asyncio.run(
+        _call_eval_endpoint(
+            response=processed_response, answer=processed_answer, criteria=criteria or [], mode=mode
+        )
+    )
+
     return EvaluationResult(
         score=result.get("score", -1.0),
         reason=result.get("reason", "Response evaluated"),
         mode=mode,
-        criteria_scores=result.get("criteria_scores", {})
+        criteria_scores=result.get("criteria_scores", {}),
     )
 
 
 def _evaluate_with_llm(
-    response: Any,
-    answer: Any,
-    llm: LLM,
-    criteria: list[str] | list[dict] | None = None
+    response: Any, answer: Any, llm: LLM, criteria: list[str] | list[dict] | None = None
 ) -> EvaluationResult:
     """Evaluate a response against an answer using a provided LLM."""
     criteria_text = ""
@@ -163,7 +143,7 @@ def _evaluate_with_llm(
             criteria_text += f"- {c['description']}\n"
         elif isinstance(c, str):
             criteria_text += f"- {c}\n"
-
+
     prompt = f"""Evaluate the quality of a response given a reference answer.
 
 REFERENCE ANSWER:
@@ -181,33 +161,29 @@ Format your answer as a JSON object with 'score' (float) and 'reason' (string) f
     try:
         # Run the evaluation asynchronously
         result_text = asyncio.run(llm.ainvoke(prompt))
-
+
         # Attempt to parse JSON response
         import json
         import re
-
+
         # Try to extract JSON if wrapped in other text
         json_match = re.search(r"\{.*?\}", result_text, re.DOTALL)
         if json_match:
             json_str = json_match.group(0)
             result = json.loads(json_str)
-
+
             return EvaluationResult(
                 score=float(result.get("score", 0.5)),
                 reason=result.get("reason", "Evaluated with custom LLM"),
-                mode="custom_llm"
+                mode="custom_llm",
            )
-
+
         # If can't parse as JSON, use default values
         return EvaluationResult(
             score=0.5,
             reason=f"Unable to parse LLM response as JSON. Raw response: {result_text[:100]}...",
-            mode="custom_llm"
+            mode="custom_llm",
        )
-
+
     except Exception as e:
-        return EvaluationResult(
-            score=0.0,
-            reason=f"LLM evaluation error: {e!s}",
-            mode="custom_llm"
-        )
+        return EvaluationResult(score=0.0, reason=f"LLM evaluation error: {e!s}", mode="custom_llm")
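
Usage sketch (not part of the package diff): one way the reworked judge() entry point could be driven in 0.2.3 with a custom evaluator, assuming the module layout shown above. EchoLLM is a hypothetical stand-in for any object satisfying the LLM protocol; note that prompt is now positional-only.

# A usage sketch, not shipped with hud-python: EchoLLM is a hypothetical
# object that satisfies the LLM protocol defined above.
from hud.evaluators.judge import judge


class EchoLLM:
    """Toy evaluator that always returns a fixed JSON verdict."""

    async def ainvoke(self, prompt: str, /) -> str:
        # `prompt` is positional-only in 0.2.3 (the `/` added in this diff).
        return '{"score": 0.9, "reason": "response covers the reference answer"}'


result = judge(
    response="Paris is the capital of France.",
    answer="Paris",
    llm=EchoLLM(),
    criteria=["Mentions the correct city"],
)
print(result.score, result.reason, result.mode)  # mode is "custom_llm" when an llm is passed

When no llm is supplied, judge() instead falls back to the remote run_eval endpoint shown in the hunk above, switching mode to "VLM" for bytes or base64-image answers.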
hud/evaluators/match.py CHANGED
@@ -2,20 +2,27 @@ from __future__ import annotations
 
 import re
 from difflib import SequenceMatcher
-from typing import Any
+from typing import TYPE_CHECKING, Protocol
 
 from textdistance import levenshtein
 
 from hud.evaluators.base import EvaluationResult
 
+if TYPE_CHECKING:
+    from collections.abc import Sequence
 
-def match_single(response: Any, answer: Any) -> EvaluationResult:
+
+class _Stringable(Protocol):
+    def __str__(self) -> str: ...
+
+
+def match_single(response: _Stringable, answer: _Stringable) -> EvaluationResult:
     """Check if the answer is present within the response.
-
+
     Args:
         response: The response to evaluate
         answer: The expected answer
-
+
     Returns:
         EvaluationResult with score=1.0 if match, 0.0 otherwise
     """
@@ -23,54 +30,50 @@ def match_single(response: Any, answer: Any) -> EvaluationResult:
     return EvaluationResult(
         score=1.0 if passed else 0.0,
         reason="Exact match" if passed else "No exact match found",
-        mode="single"
+        mode="single",
     )
 
 
-def match_all(response: Any, answers: list) -> EvaluationResult:
+def match_all(response: _Stringable, answers: Sequence[_Stringable]) -> EvaluationResult:
     """Count how many expected answers are in the response.
-
+
     Args:
         response: The response to evaluate
         answers: List of expected answers
-
+
     Returns:
         EvaluationResult with score=proportion of matches (0.0-1.0)
     """
     response_str = str(response).lower()
     matches = 0
-
+
     for answer in answers:
         if str(answer).lower() in response_str:
             matches += 1
-
+
     score = matches / len(answers) if answers else 0.0
-
+
     if matches == len(answers):
         reason = f"All {matches} expected items found"
     else:
         reason = f"Only {matches} of {len(answers)} expected items found"
-
-    return EvaluationResult(
-        score=score,
-        reason=reason,
-        mode="all"
-    )
 
+    return EvaluationResult(score=score, reason=reason, mode="all")
 
-def match_fuzzy(response: Any, answer: Any) -> EvaluationResult:
+
+def match_fuzzy(response: _Stringable, answer: _Stringable) -> EvaluationResult:
     """Calculate similarity using Levenshtein distance.
-
+
     Args:
         response: The response to evaluate
         answer: The expected answer
-
+
     Returns:
         EvaluationResult with score=similarity (0.0-1.0)
     """
     s1 = str(response).lower()
     s2 = str(answer).lower()
-
+
     if s1 == s2:
         score = 1.0
     elif len(s1) == 0 or len(s2) == 0:
@@ -80,21 +83,19 @@ def match_fuzzy(response: Any, answer: Any) -> EvaluationResult:
         distance = levenshtein.distance(s1, s2)
         max_len = max(len(s1), len(s2))
         score = 1.0 - (distance / max_len)
-
+
     return EvaluationResult(
-        score=score,
-        reason=f"Fuzzy match with {score:.1%} similarity",
-        mode="fuzz"
+        score=score, reason=f"Fuzzy match with {score:.1%} similarity", mode="fuzz"
     )
 
 
-def match_regex(response: Any, pattern: str) -> EvaluationResult:
+def match_regex(response: _Stringable, pattern: str) -> EvaluationResult:
     """Check if response matches regex pattern.
-
+
     Args:
         response: The response to evaluate
         pattern: Regular expression pattern to match
-
+
     Returns:
         EvaluationResult with score=1.0 if match, 0.0 otherwise
     """
@@ -104,23 +105,19 @@ def match_regex(response: Any, pattern: str) -> EvaluationResult:
         return EvaluationResult(
             score=1.0 if passed else 0.0,
             reason="Regex pattern matched" if passed else "Regex pattern did not match",
-            mode="regex"
+            mode="regex",
        )
     except re.error:
-        return EvaluationResult(
-            score=0.0,
-            reason="Invalid regex pattern",
-            mode="regex"
-        )
+        return EvaluationResult(score=0.0, reason="Invalid regex pattern", mode="regex")
 
 
-def match_diff(response: Any, answer: Any) -> EvaluationResult:
+def match_diff(response: _Stringable, answer: _Stringable) -> EvaluationResult:
     """Compare difference between response and answer.
-
+
     Args:
         response: The response to evaluate
         answer: The expected answer
-
+
     Returns:
         EvaluationResult with score=similarity (0.0-1.0)
     """
@@ -130,34 +127,30 @@ def match_diff(response: Any, answer: Any) -> EvaluationResult:
     else:
         score = _match_string_diff(response, answer)
         reason = f"String difference with {score:.1%} similarity"
-
-    return EvaluationResult(
-        score=score,
-        reason=reason,
-        mode="diff"
-    )
+
+    return EvaluationResult(score=score, reason=reason, mode="diff")
 
 
-def _match_string_diff(response: Any, answer: Any) -> float:
+def _match_string_diff(response: _Stringable, answer: _Stringable) -> float:
     """Compare difference between response and answer strings."""
     matcher = SequenceMatcher(None, str(response), str(answer))
     return matcher.ratio()
-
+
 
 def _match_numeric_diff(response: float, answer: float) -> float:
     """Calculate normalized difference between numeric values.
-
+
     Returns a value between 0 and 1, where 1 means identical and 0 means maximum difference.
     """
     if response == answer:
         return 1.0
-
+
     # Simple absolute difference normalized to a 0-1 scale
     diff = abs(response - answer)
     max_val = max(abs(response), abs(answer))
-
+
     if max_val == 0:
         return 1.0  # Both are zero
-
+
     # Normalize and invert so 1.0 means identical
     return max(0.0, 1.0 - min(1.0, diff / max_val))
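
Usage sketch (not part of the package diff): the match helpers after this change, assuming the signatures shown above. Any value implementing __str__ (the new _Stringable protocol) is accepted, so plain strings and numbers both work; the inputs below are illustrative only.

# A usage sketch based on the signatures in the diff above.
from hud.evaluators.match import match_all, match_fuzzy, match_regex

response = "The answer is 42, computed by Deep Thought."

print(match_all(response, ["42", "Deep Thought"]).score)  # 1.0 -- both expected items found
print(match_fuzzy("colour", "color").score)               # ~0.83, Levenshtein-based similarity
print(match_regex(response, r"The answer is \d+").score)  # expected 1.0 -- pattern matches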
hud/evaluators/remote.py CHANGED
@@ -9,19 +9,16 @@ from hud.settings import settings
 
 
 async def _remote_eval_call(
-    response: Any,
-    answer: Any,
-    eval_type: str,
-    config: dict[str, Any] | None = None
+    response: Any, answer: Any, eval_type: str, config: dict[str, Any] | None = None
 ) -> dict[str, Any]:
     """Send an evaluation request to the remote server.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
         eval_type: Type of evaluation (e.g., "match", "judge", "agent")
         config: Optional configuration parameters
-
+
     Returns:
         Dictionary with evaluation results from the server
     """
@@ -33,46 +30,36 @@ async def _remote_eval_call(
                 "response": response,
                 "answer": answer,
                 "type": eval_type,
-                "config": config or {}
+                "config": config or {},
             },
             api_key=settings.api_key,
        )
         return result
     except Exception as e:
-        return {
-            "score": -1.0,
-            "reason": f"Remote evaluation failed: {e!s}",
-            "details": {}
-        }
+        return {"score": -1.0, "reason": f"Remote evaluation failed: {e!s}", "details": {}}
 
 
 def remote_evaluate(
-    response: Any,
-    answer: Any,
-    eval_type: str = "default",
-    config: dict[str, Any] | None = None
+    response: Any, answer: Any, eval_type: str = "default", config: dict[str, Any] | None = None
 ) -> EvaluationResult:
     """Evaluate a response using remote evaluation services.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
         eval_type: Type of evaluation to perform
         config: Optional configuration for the evaluation
-
+
     Returns:
         EvaluationResult containing the evaluation results
     """
-    result = asyncio.run(_remote_eval_call(
-        response=response,
-        answer=answer,
-        eval_type=eval_type,
-        config=config
-    ))
-
+    result = asyncio.run(
+        _remote_eval_call(response=response, answer=answer, eval_type=eval_type, config=config)
+    )
+
     return EvaluationResult(
         score=result.get("score", -1.0),
         reason=result.get("reason", "Remote evaluation completed"),
         mode=eval_type,
-        criteria_scores=result.get("details", {})
+        criteria_scores=result.get("details", {}),
     )
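
Usage sketch (not part of the package diff): the remote_evaluate wrapper as its signature appears above, assuming hud's settings (base_url, api_key) are already configured; the input values are illustrative only.

# A usage sketch based on the signature above; requires a configured HUD API key.
from hud.evaluators.remote import remote_evaluate

result = remote_evaluate(
    response="The capital of France is Paris.",
    answer="Paris",
    eval_type="match",  # the docstring lists "match", "judge", "agent" as example types
)
# On any request failure the helper returns score=-1.0 with the error in `reason`.
print(result.score, result.reason, result.mode)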
File without changes
hud/evaluators/tests/test_inspect.py ADDED
@@ -0,0 +1,12 @@
+from __future__ import annotations
+
+from hud.evaluators.inspect import inspect_evaluate
+
+
+def test_inspect_evaluate_basic():
+    """Test basic functionality of inspect_evaluate."""
+    result = inspect_evaluate("Test response", "Test answer")
+
+    assert result.score == 0.0
+    assert result.reason == "Inspect evaluation not implemented"
+    assert result.mode == "inspect"