hud-python 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of hud-python might be problematic.

Files changed (54)
  1. hud/__init__.py +7 -4
  2. hud/adapters/common/adapter.py +14 -3
  3. hud/adapters/common/tests/test_adapter.py +16 -4
  4. hud/datasets.py +188 -0
  5. hud/env/docker_client.py +14 -2
  6. hud/env/local_docker_client.py +28 -6
  7. hud/gym.py +0 -9
  8. hud/{mcp_agent → mcp}/__init__.py +2 -0
  9. hud/mcp/base.py +631 -0
  10. hud/{mcp_agent → mcp}/claude.py +52 -47
  11. hud/mcp/client.py +312 -0
  12. hud/{mcp_agent → mcp}/langchain.py +52 -33
  13. hud/{mcp_agent → mcp}/openai.py +56 -40
  14. hud/{mcp_agent → mcp}/tests/test_base.py +129 -54
  15. hud/mcp/tests/test_claude.py +294 -0
  16. hud/mcp/tests/test_client.py +324 -0
  17. hud/mcp/tests/test_openai.py +238 -0
  18. hud/settings.py +6 -0
  19. hud/task.py +2 -88
  20. hud/taskset.py +2 -23
  21. hud/telemetry/__init__.py +5 -0
  22. hud/telemetry/_trace.py +180 -17
  23. hud/telemetry/context.py +79 -0
  24. hud/telemetry/exporter.py +165 -6
  25. hud/telemetry/job.py +141 -0
  26. hud/telemetry/tests/test_trace.py +36 -25
  27. hud/tools/__init__.py +14 -1
  28. hud/tools/computer/hud.py +13 -0
  29. hud/tools/executors/__init__.py +19 -2
  30. hud/tools/executors/pyautogui.py +84 -50
  31. hud/tools/executors/tests/test_pyautogui_executor.py +4 -1
  32. hud/tools/playwright_tool.py +73 -67
  33. hud/tools/tests/test_edit.py +8 -1
  34. hud/tools/tests/test_tools.py +3 -0
  35. hud/trajectory.py +5 -1
  36. hud/utils/tests/test_version.py +1 -1
  37. hud/version.py +1 -1
  38. {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/METADATA +20 -14
  39. {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/RECORD +42 -47
  40. hud/evaluators/__init__.py +0 -9
  41. hud/evaluators/base.py +0 -32
  42. hud/evaluators/inspect.py +0 -24
  43. hud/evaluators/judge.py +0 -189
  44. hud/evaluators/match.py +0 -156
  45. hud/evaluators/remote.py +0 -65
  46. hud/evaluators/tests/__init__.py +0 -0
  47. hud/evaluators/tests/test_inspect.py +0 -12
  48. hud/evaluators/tests/test_judge.py +0 -231
  49. hud/evaluators/tests/test_match.py +0 -115
  50. hud/evaluators/tests/test_remote.py +0 -98
  51. hud/mcp_agent/base.py +0 -723
  52. /hud/{mcp_agent → mcp}/tests/__init__.py +0 -0
  53. {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/WHEEL +0 -0
  54. {hud_python-0.3.0.dist-info → hud_python-0.3.2.dist-info}/licenses/LICENSE +0 -0
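
The most visible structural change in this release is the move of the MCP agent package from hud/mcp_agent/ to hud/mcp/ (items 8-17 and 51-52 above), alongside the removal of the hud/evaluators package whose deletions are shown below. A minimal migration sketch, assuming only the module paths listed in this diff; the public symbols re-exported by hud.mcp are not visible here, so the imports are illustrative:

# Hypothetical import migration based on the renames in the file list above.
# hud-python 0.3.0:
#     from hud.mcp_agent import claude, langchain, openai
# hud-python 0.3.2:
from hud.mcp import claude, langchain, openai  # same modules, new package path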
hud/evaluators/remote.py DELETED
@@ -1,65 +0,0 @@
-from __future__ import annotations
-
-import asyncio
-from typing import Any
-
-from hud.evaluators.base import EvaluationResult
-from hud.server import make_request
-from hud.settings import settings
-
-
-async def _remote_eval_call(
-    response: Any, answer: Any, eval_type: str, config: dict[str, Any] | None = None
-) -> dict[str, Any]:
-    """Send an evaluation request to the remote server.
-
-    Args:
-        response: The response to evaluate
-        answer: The reference answer to compare against
-        eval_type: Type of evaluation (e.g., "match", "judge", "agent")
-        config: Optional configuration parameters
-
-    Returns:
-        Dictionary with evaluation results from the server
-    """
-    try:
-        result = await make_request(
-            method="POST",
-            url=f"{settings.base_url}/evaluations/evaluate",
-            json={
-                "response": response,
-                "answer": answer,
-                "type": eval_type,
-                "config": config or {},
-            },
-            api_key=settings.api_key,
-        )
-        return result
-    except Exception as e:
-        return {"score": -1.0, "reason": f"Remote evaluation failed: {e!s}", "details": {}}
-
-
-def remote_evaluate(
-    response: Any, answer: Any, eval_type: str = "default", config: dict[str, Any] | None = None
-) -> EvaluationResult:
-    """Evaluate a response using remote evaluation services.
-
-    Args:
-        response: The response to evaluate
-        answer: The reference answer to compare against
-        eval_type: Type of evaluation to perform
-        config: Optional configuration for the evaluation
-
-    Returns:
-        EvaluationResult containing the evaluation results
-    """
-    result = asyncio.run(
-        _remote_eval_call(response=response, answer=answer, eval_type=eval_type, config=config)
-    )
-
-    return EvaluationResult(
-        score=result.get("score", -1.0),
-        reason=result.get("reason", "Remote evaluation completed"),
-        mode=eval_type,
-        criteria_scores=result.get("details", {}),
-    )
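
For context, the module deleted above wrapped a single POST to {base_url}/evaluations/evaluate behind a synchronous helper. A minimal usage sketch against the 0.3.0 API (no equivalent exists in 0.3.2; the argument values are illustrative):

# Runs only against hud-python 0.3.0, with settings.api_key configured.
from hud.evaluators.remote import remote_evaluate

result = remote_evaluate(
    response="Paris is the capital of France.",  # output to grade
    answer="Paris",                              # reference answer
    eval_type="match",                           # e.g. "match", "judge", "agent"
    config={"strict": True},                     # optional server-side parameters
)
print(result.score, result.reason, result.criteria_scores)

Because the helper called asyncio.run internally, it was only usable outside an already-running event loop.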
hud/evaluators/tests/__init__.py DELETED
File without changes
hud/evaluators/tests/test_inspect.py DELETED
@@ -1,12 +0,0 @@
-from __future__ import annotations
-
-from hud.evaluators.inspect import inspect_evaluate
-
-
-def test_inspect_evaluate_basic():
-    """Test basic functionality of inspect_evaluate."""
-    result = inspect_evaluate("Test response", "Test answer")
-
-    assert result.score == 0.0
-    assert result.reason == "Inspect evaluation not implemented"
-    assert result.mode == "inspect"
hud/evaluators/tests/test_judge.py DELETED
@@ -1,231 +0,0 @@
-from __future__ import annotations
-
-import base64
-
-import pytest
-
-from hud.evaluators.base import EvaluationResult
-from hud.evaluators.judge import (
-    _call_eval_endpoint,
-    _evaluate_with_llm,
-    _is_base64_image,
-    _process_input,
-    judge,
-)
-
-
-class _MockLLM:
-    """Mock LLM for testing."""
-
-    def __init__(self, response_text):
-        self.response_text = response_text
-
-    async def ainvoke(self, _prompt: str) -> str:
-        return self.response_text
-
-
-@pytest.mark.parametrize(
-    "input_data, expected_result",
-    [
-        ("Hello world", "Hello world"),
-        (123, "123"),
-        (["Hello", "world"], ["Hello", "world"]),
-        ({"key": "value"}, {"key": "value"}),
-        (b"Hello world", base64.b64encode(b"Hello world").decode("utf-8")),
-    ],
-)
-def test_process_input(input_data, expected_result):
-    """Test processing various input types."""
-    result = _process_input(input_data)
-    assert result == expected_result
-
-
-@pytest.mark.parametrize(
-    "input_data, expected_result",
-    [
-        ("not an image", False),
-        ("data:image/png;base64,abc123", True),
-        (b"not an image", False),
-        (123, False),
-    ],
-)
-def test_is_base64_image(input_data, expected_result):
-    """Test base64 image detection."""
-    assert _is_base64_image(input_data) == expected_result
-
-
-def test_is_base64_image_with_signatures(mocker):
-    """Test base64 image detection with common image signatures."""
-    # Mock base64.b64decode to return different image signatures
-    mock_b64decode = mocker.patch("base64.b64decode")
-
-    # Test JPEG signature
-    mock_b64decode.return_value = b"\xff\xd8\xff" + b"some data"
-    assert _is_base64_image("not_really_base64_but_mocked") is True
-
-    # Test PNG signature
-    mock_b64decode.return_value = b"\x89PNG\r\n\x1a\n" + b"some data"
-    assert _is_base64_image("not_really_base64_but_mocked") is True
-
-    # Test GIF signature
-    mock_b64decode.return_value = b"GIF8" + b"some data"
-    assert _is_base64_image("not_really_base64_but_mocked") is True
-
-    # Test RIFF signature (WebP)
-    mock_b64decode.return_value = b"RIFF" + b"some data"
-    assert _is_base64_image("not_really_base64_but_mocked") is True
-
-
-@pytest.mark.asyncio
-async def test_call_eval_endpoint_success(mocker):
-    """Test successful remote evaluation call."""
-    mock_response = {
-        "score": 0.8,
-        "reason": "Good response",
-        "criteria_scores": {"relevance": 0.9, "accuracy": 0.7},
-    }
-    mock_make_request = mocker.patch(
-        "hud.evaluators.judge.make_request", return_value=mock_response
-    )
-    result = await _call_eval_endpoint("test response", "test answer", [], "LLM")
-    assert result == mock_response
-    mock_make_request.assert_called_once()
-
-
-@pytest.mark.asyncio
-async def test_call_eval_endpoint_failure(mocker):
-    """Test remote evaluation call failure."""
-    mocker.patch("hud.evaluators.judge.make_request", side_effect=Exception("API error"))
-    result = await _call_eval_endpoint("test response", "test answer", [], "LLM")
-    assert result["score"] == -1.0
-    assert "Remote evaluation failed" in result["reason"]
-    assert result["criteria_scores"] == {}
-
-
-def test_judge_without_llm(mocker):
-    """Test judge function without custom LLM."""
-    mock_result = {
-        "score": 0.9,
-        "reason": "Good answer",
-        "criteria_scores": {"relevance": 1.0},
-    }
-
-    async def mock_endpoint(*args, **kwargs):
-        return mock_result
-
-    mocker.patch("hud.evaluators.judge._call_eval_endpoint", mock_endpoint)
-    result = judge("test response", "test answer")
-
-    assert result.score == 0.9
-    assert result.reason == "Good answer"
-    assert result.mode == "LLM"
-    assert result.criteria_scores == {"relevance": 1.0}
-
-
-def test_judge_with_image_answer(mocker):
-    """Test judge function with an image as the answer."""
-    mock_result = {
-        "score": 0.85,
-        "reason": "Good image analysis",
-        "criteria_scores": {"visual_accuracy": 0.85},
-    }
-
-    async def mock_endpoint(*args, **kwargs):
-        return mock_result
-
-    mocker.patch("hud.evaluators.judge._call_eval_endpoint", mock_endpoint)
-
-    # Create a mock image
-    image_data = b"fake_image_data"
-    base64_image = base64.b64encode(image_data).decode("utf-8")
-    image_uri = f"data:image/jpeg;base64,{base64_image}"
-
-    result = judge("description of image", image_uri)
-
-    assert result.score == 0.85
-    assert result.reason == "Good image analysis"
-    assert result.mode == "VLM"  # Should use VLM mode for images
-    assert result.criteria_scores == {"visual_accuracy": 0.85}
-
-
-def test_judge_with_llm(mocker):
-    """Test judge function with custom LLM."""
-    mock_llm = _MockLLM('{"score": 0.75, "reason": "Pretty good"}')
-    mock_result = EvaluationResult(score=0.75, reason="Pretty good", mode="custom_llm")
-    mocker.patch("hud.evaluators.judge._evaluate_with_llm", return_value=mock_result)
-    result = judge("test response", "test answer", llm=mock_llm)
-    assert result.score == 0.75
-    assert result.reason == "Pretty good"
-    assert result.mode == "custom_llm"
-
-
-def test_evaluate_with_llm_valid_json():
-    """Test _evaluate_with_llm with valid JSON response."""
-    llm = _MockLLM('{"score": 0.85, "reason": "The response is accurate and well-structured."}')
-    result = _evaluate_with_llm("test response", "test answer", llm)
-
-    assert result.score == 0.85
-    assert result.reason == "The response is accurate and well-structured."
-    assert result.mode == "custom_llm"
-
-
-def test_evaluate_with_llm_json_in_text():
-    """Test _evaluate_with_llm with JSON embedded in text."""
-    llm_response = """
-    I've evaluated the response and here's my assessment:
-
-    {"score": 0.7, "reason": "Good but could be more detailed"}
-
-    I hope this helps!
-    """
-    llm = _MockLLM(llm_response)
-    result = _evaluate_with_llm("test response", "test answer", llm)
-
-    assert result.score == 0.7
-    assert result.reason == "Good but could be more detailed"
-    assert result.mode == "custom_llm"
-
-
-def test_evaluate_with_llm_invalid_json():
-    """Test _evaluate_with_llm with invalid JSON response."""
-    llm = _MockLLM("This is not a JSON response")
-    result = _evaluate_with_llm("test response", "test answer", llm)
-
-    assert result.score == 0.5  # Default score for unparseable responses
-    assert "Unable to parse LLM response as JSON" in result.reason
-    assert result.mode == "custom_llm"
-
-
-def test_evaluate_with_llm_exception(mocker):
-    """Test _evaluate_with_llm when an exception occurs."""
-    # Mock the LLM to raise an exception
-    failing_llm = _MockLLM("doesn't matter")
-    mocker.patch.object(failing_llm, "ainvoke", side_effect=Exception("LLM API error"))
-
-    result = _evaluate_with_llm("test response", "test answer", failing_llm)
-
-    assert result.score == 0.0  # Zero score for errors
-    assert "LLM evaluation error: LLM API error" in result.reason
-    assert result.mode == "custom_llm"
-
-
-def test_evaluate_with_llm_with_criteria():
-    """Test _evaluate_with_llm with evaluation criteria."""
-    llm = _MockLLM('{"score": 0.9, "reason": "Excellent match on all criteria"}')
-
-    # Test with string criteria
-    string_criteria = ["Accuracy", "Relevance", "Completeness"]
-    result = _evaluate_with_llm("test response", "test answer", llm, criteria=string_criteria)
-
-    assert result.score == 0.9
-    assert result.reason == "Excellent match on all criteria"
-
-    # Test with dict criteria
-    dict_criteria = [
-        {"description": "Factual accuracy", "weight": 0.6},
-        {"description": "Grammar and spelling", "weight": 0.4},
-    ]
-    result = _evaluate_with_llm("test response", "test answer", llm, criteria=dict_criteria)
-
-    assert result.score == 0.9
-    assert result.reason == "Excellent match on all criteria"
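
The tests above pin down the judge API that 0.3.2 drops: judge() defaults to the remote "LLM" judge, switches to "VLM" when the answer is a base64 image URI, and accepts any object exposing an async ainvoke(prompt) -> str for a local "custom_llm" evaluation (the internal _evaluate_with_llm also takes string or weighted-dict criteria). A minimal sketch of that 0.3.0 usage, with illustrative values:

# hud-python 0.3.0 only. TinyJudge stands in for any LLM wrapper whose
# ainvoke() returns a JSON string containing "score" and "reason".
from hud.evaluators.judge import judge

class TinyJudge:
    async def ainvoke(self, prompt: str) -> str:
        return '{"score": 0.8, "reason": "Mostly correct"}'

remote_result = judge("model output", "reference answer")                # mode "LLM", hits the remote endpoint
local_result = judge("model output", "reference answer", llm=TinyJudge())  # mode "custom_llm", stays local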
hud/evaluators/tests/test_match.py DELETED
@@ -1,115 +0,0 @@
-from __future__ import annotations
-
-import pytest
-
-from hud.evaluators.match import match_all, match_diff, match_fuzzy, match_regex, match_single
-
-
-@pytest.mark.parametrize(
-    "response, answer, expected_score, expected_reason, expected_mode",
-    [
-        ("Hello, world!", "world", 1.0, "Exact match", "single"),
-        ("Hello, world!", "not world", 0.0, "No exact match found", "single"),
-    ],
-)
-def test_match_single(
-    response: str,
-    answer: str,
-    expected_score: float,
-    expected_reason: str,
-    expected_mode: str,
-):
-    result = match_single(response, answer)
-    assert result.score == expected_score
-    assert result.reason == expected_reason
-    assert result.mode == expected_mode
-
-
-@pytest.mark.parametrize(
-    "response, answers, expected_score, expected_reason, expected_mode",
-    [
-        ("Hello, world!", ["world", "hello"], 1.0, "All 2 expected items found", "all"),
-        ("Hello, world!", ["world", "not hello"], 0.5, "Only 1 of 2 expected items found", "all"),
-    ],
-)
-def test_match_all(
-    response: str,
-    answers: list[str],
-    expected_score: float,
-    expected_reason: str,
-    expected_mode: str,
-):
-    result = match_all(response, answers)
-    assert result.score == expected_score
-    assert result.reason == expected_reason
-    assert result.mode == expected_mode
-
-
-@pytest.mark.parametrize(
-    "response, answer, expected_score, expected_reason, expected_mode",
-    [
-        ("hello world", "hello world", 1.0, "Fuzzy match with 100.0% similarity", "fuzz"),
-        ("hello wrld", "hello world", 0.9, "Fuzzy match with 90.9% similarity", "fuzz"),
-        ("hello", "hello world", 0.45, "Fuzzy match with 45.5% similarity", "fuzz"),
-        ("", "hello world", 0.0, "Fuzzy match with 0.0% similarity", "fuzz"),
-    ],
-)
-def test_match_fuzzy(
-    response: str,
-    answer: str,
-    expected_score: float,
-    expected_reason: str,
-    expected_mode: str,
-):
-    result = match_fuzzy(response, answer)
-    assert result.score == pytest.approx(expected_score, abs=1e-2)
-    assert result.reason == expected_reason
-    assert result.mode == expected_mode
-
-
-@pytest.mark.parametrize(
-    "response, pattern, expected_score, expected_reason, expected_mode",
-    [
-        ("hello world", r"hello.*", 1.0, "Regex pattern matched", "regex"),
-        ("hello world", r"^hello.*$", 1.0, "Regex pattern matched", "regex"),
-        ("hello world", r"world$", 1.0, "Regex pattern matched", "regex"),
-        ("hello world", r"^goodbye.*$", 0.0, "Regex pattern did not match", "regex"),
-        ("hello world", r"[invalid[", 0.0, "Invalid regex pattern", "regex"),
-    ],
-)
-def test_match_regex(
-    response: str,
-    pattern: str,
-    expected_score: float,
-    expected_reason: str,
-    expected_mode: str,
-):
-    result = match_regex(response, pattern)
-    assert result.score == expected_score
-    assert result.reason == expected_reason
-    assert result.mode == expected_mode
-
-
-@pytest.mark.parametrize(
-    "response, answer, expected_score, expected_reason, expected_mode",
-    [
-        ("hello world", "hello world", 1.0, "String difference with 100.0% similarity", "diff"),
-        ("hello", "hello world", 0.625, "String difference with 62.5% similarity", "diff"),
-        ("", "hello world", 0.0, "String difference with 0.0% similarity", "diff"),
-        (100, 100, 1.0, "Numeric difference: 0", "diff"),
-        (90, 100, 0.9, "Numeric difference: 10", "diff"),
-        (0, 100, 0.0, "Numeric difference: 100", "diff"),
-        (-100, 100, 0.0, "Numeric difference: 200", "diff"),
-    ],
-)
-def test_match_diff(
-    response: str | int | float,
-    answer: str | int | float,
-    expected_score: float,
-    expected_reason: str,
-    expected_mode: str,
-):
-    result = match_diff(response, answer)
-    assert result.score == pytest.approx(expected_score, abs=1e-2)
-    assert result.reason == expected_reason
-    assert result.mode == expected_mode
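
These parametrized cases double as documentation for the five removed matchers, each returning an EvaluationResult with score, reason, and mode. A quick sketch of the 0.3.0 calls they exercise:

# hud-python 0.3.0 only; scores are floats in [0, 1].
from hud.evaluators.match import match_all, match_diff, match_fuzzy, match_regex, match_single

match_single("Hello, world!", "world")           # exact substring match -> 1.0
match_all("Hello, world!", ["world", "hello"])   # fraction of expected items found
match_fuzzy("hello wrld", "hello world")         # fuzzy similarity ratio
match_regex("hello world", r"^hello.*$")         # 1.0 if the pattern matches
match_diff(90, 100)                              # string or numeric difference -> 0.9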
hud/evaluators/tests/test_remote.py DELETED
@@ -1,98 +0,0 @@
-from __future__ import annotations
-
-import pytest
-
-from hud.evaluators.base import EvaluationResult
-from hud.evaluators.remote import _remote_eval_call, remote_evaluate
-
-
-@pytest.mark.asyncio
-async def test_remote_eval_call_success(mocker):
-    mock_response = {
-        "score": 0.85,
-        "reason": "Good match",
-        "details": {"relevance": 0.9, "correctness": 0.8},
-    }
-    mock_make_request = mocker.patch(
-        "hud.evaluators.remote.make_request", return_value=mock_response
-    )
-
-    result = await _remote_eval_call(
-        response="test response", answer="test answer", eval_type="match"
-    )
-
-    assert result == mock_response
-    mock_make_request.assert_called_once()
-    call_args = mock_make_request.call_args[1]
-    assert call_args["method"] == "POST"
-    assert "evaluations/evaluate" in call_args["url"]
-    assert call_args["json"]["response"] == "test response"
-    assert call_args["json"]["answer"] == "test answer"
-    assert call_args["json"]["type"] == "match"
-
-
-@pytest.mark.asyncio
-async def test_remote_eval_call_with_config(mocker):
-    mock_response = {"score": 0.75, "reason": "Good", "details": {}}
-    mock_make_request = mocker.patch(
-        "hud.evaluators.remote.make_request", return_value=mock_response
-    )
-
-    config = {"threshold": 0.8, "strict": True}
-    result = await _remote_eval_call(
-        response="test response", answer="test answer", eval_type="judge", config=config
-    )
-
-    assert result == mock_response
-    mock_make_request.assert_called_once()
-    call_args = mock_make_request.call_args[1]
-    assert call_args["json"]["config"] == config
-
-
-@pytest.mark.asyncio
-async def test_remote_eval_call_failure(mocker):
-    mocker.patch("hud.evaluators.remote.make_request", side_effect=Exception("API error"))
-
-    result = await _remote_eval_call(
-        response="test response", answer="test answer", eval_type="match"
-    )
-
-    assert result["score"] == -1.0
-    assert "Remote evaluation failed" in result["reason"]
-    assert "API error" in result["reason"]
-    assert result["details"] == {}
-
-
-def test_remote_evaluate(mocker):
-    mock_result = {"score": 0.9, "reason": "Excellent match", "details": {"similarity": 0.95}}
-
-    async def mock_remote_call(*args, **kwargs):
-        return mock_result
-
-    mocker.patch("hud.evaluators.remote._remote_eval_call", side_effect=mock_remote_call)
-
-    result = remote_evaluate(
-        response="test response", answer="test answer", eval_type="custom_eval"
-    )
-
-    assert isinstance(result, EvaluationResult)
-    assert result.score == 0.9
-    assert result.reason == "Excellent match"
-    assert result.mode == "custom_eval"
-    assert result.criteria_scores == {"similarity": 0.95}
-
-
-def test_remote_evaluate_missing_fields(mocker):
-    mock_result = {"score": 0.8}  # Missing reason and details
-
-    async def mock_remote_call(*args, **kwargs):
-        return mock_result
-
-    mocker.patch("hud.evaluators.remote._remote_eval_call", side_effect=mock_remote_call)
-
-    result = remote_evaluate(response="test response", answer="test answer")
-
-    assert result.score == 0.8
-    assert result.reason == "Remote evaluation completed"
-    assert result.mode == "default"
-    assert result.criteria_scores == {}
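
Taken together with the deleted remote.py above, these tests spell out the wire format the removed evaluators used: a POST whose JSON body carries the response, answer, type, and config, and whose reply maps straight onto EvaluationResult. A sketch of that contract, with field names taken from the code above and illustrative values:

# Shapes used by the removed 0.3.0 remote evaluators (reconstructed from
# _remote_eval_call and the assertions above; values are examples only).
request_json = {
    "response": "test response",
    "answer": "test answer",
    "type": "match",   # eval_type
    "config": {},      # optional parameters, e.g. {"threshold": 0.8}
}
server_reply = {
    "score": 0.9,
    "reason": "Excellent match",
    "details": {"similarity": 0.95},  # becomes EvaluationResult.criteria_scores
}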