hud-python 0.2.10__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of hud-python might be problematic.

Files changed (86)
  1. hud/__init__.py +20 -8
  2. hud/adapters/common/adapter.py +14 -3
  3. hud/adapters/common/tests/test_adapter.py +16 -4
  4. hud/datasets.py +188 -0
  5. hud/env/docker_client.py +15 -3
  6. hud/env/environment.py +10 -7
  7. hud/env/local_docker_client.py +29 -7
  8. hud/env/remote_client.py +1 -1
  9. hud/env/remote_docker_client.py +2 -2
  10. hud/exceptions.py +2 -1
  11. hud/gym.py +0 -9
  12. hud/mcp/__init__.py +17 -0
  13. hud/mcp/base.py +631 -0
  14. hud/mcp/claude.py +321 -0
  15. hud/mcp/client.py +312 -0
  16. hud/mcp/langchain.py +250 -0
  17. hud/mcp/openai.py +334 -0
  18. hud/mcp/tests/__init__.py +1 -0
  19. hud/mcp/tests/test_base.py +512 -0
  20. hud/mcp/tests/test_claude.py +294 -0
  21. hud/mcp/tests/test_client.py +324 -0
  22. hud/mcp/tests/test_openai.py +238 -0
  23. hud/settings.py +20 -2
  24. hud/task.py +5 -88
  25. hud/taskset.py +2 -23
  26. hud/telemetry/__init__.py +16 -7
  27. hud/telemetry/_trace.py +246 -72
  28. hud/telemetry/context.py +88 -27
  29. hud/telemetry/exporter.py +171 -11
  30. hud/telemetry/instrumentation/mcp.py +174 -410
  31. hud/telemetry/job.py +141 -0
  32. hud/telemetry/mcp_models.py +13 -74
  33. hud/telemetry/tests/test_context.py +9 -6
  34. hud/telemetry/tests/test_trace.py +120 -78
  35. hud/tools/__init__.py +34 -0
  36. hud/tools/base.py +65 -0
  37. hud/tools/bash.py +137 -0
  38. hud/tools/computer/__init__.py +13 -0
  39. hud/tools/computer/anthropic.py +411 -0
  40. hud/tools/computer/hud.py +315 -0
  41. hud/tools/computer/openai.py +283 -0
  42. hud/tools/edit.py +290 -0
  43. hud/tools/executors/__init__.py +30 -0
  44. hud/tools/executors/base.py +331 -0
  45. hud/tools/executors/pyautogui.py +619 -0
  46. hud/tools/executors/tests/__init__.py +1 -0
  47. hud/tools/executors/tests/test_base_executor.py +338 -0
  48. hud/tools/executors/tests/test_pyautogui_executor.py +165 -0
  49. hud/tools/executors/xdo.py +503 -0
  50. hud/tools/helper/README.md +56 -0
  51. hud/tools/helper/__init__.py +9 -0
  52. hud/tools/helper/mcp_server.py +78 -0
  53. hud/tools/helper/server_initialization.py +115 -0
  54. hud/tools/helper/utils.py +58 -0
  55. hud/tools/playwright_tool.py +379 -0
  56. hud/tools/tests/__init__.py +3 -0
  57. hud/tools/tests/test_bash.py +152 -0
  58. hud/tools/tests/test_computer.py +52 -0
  59. hud/tools/tests/test_computer_actions.py +34 -0
  60. hud/tools/tests/test_edit.py +240 -0
  61. hud/tools/tests/test_init.py +27 -0
  62. hud/tools/tests/test_playwright_tool.py +183 -0
  63. hud/tools/tests/test_tools.py +157 -0
  64. hud/tools/tests/test_utils.py +156 -0
  65. hud/tools/utils.py +50 -0
  66. hud/trajectory.py +5 -1
  67. hud/types.py +10 -1
  68. hud/utils/tests/test_init.py +21 -0
  69. hud/utils/tests/test_version.py +1 -1
  70. hud/version.py +1 -1
  71. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/METADATA +27 -18
  72. hud_python-0.3.1.dist-info/RECORD +119 -0
  73. hud/evaluators/__init__.py +0 -9
  74. hud/evaluators/base.py +0 -32
  75. hud/evaluators/inspect.py +0 -24
  76. hud/evaluators/judge.py +0 -189
  77. hud/evaluators/match.py +0 -156
  78. hud/evaluators/remote.py +0 -65
  79. hud/evaluators/tests/__init__.py +0 -0
  80. hud/evaluators/tests/test_inspect.py +0 -12
  81. hud/evaluators/tests/test_judge.py +0 -231
  82. hud/evaluators/tests/test_match.py +0 -115
  83. hud/evaluators/tests/test_remote.py +0 -98
  84. hud_python-0.2.10.dist-info/RECORD +0 -85
  85. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/WHEEL +0 -0
  86. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/licenses/LICENSE +0 -0
hud/evaluators/__init__.py DELETED
@@ -1,9 +0,0 @@
- """
- Evaluators for assessing task responses.
- """
-
- from __future__ import annotations
-
- from hud.evaluators.base import Evaluator
-
- __all__ = ["Evaluator"]
hud/evaluators/base.py DELETED
@@ -1,32 +0,0 @@
- from __future__ import annotations
-
- from abc import ABC, abstractmethod
- from typing import TYPE_CHECKING
-
- from pydantic import BaseModel, Field
-
- if TYPE_CHECKING:
-     from hud.task import Task
-
-
- class EvaluationResult(BaseModel):
-     """Result of an evaluation.
-
-     Attributes:
-         score: Float score between 0 and 1
-         reason: Explanation of the evaluation
-         mode: Mode used for matching, if applicable
-     """
-
-     score: float
-     reason: str
-     mode: str | None = None
-     criteria_scores: dict[str, float] | None = Field(default_factory=dict)
-
-
- class Evaluator(ABC):
-     """Abstract base class for evaluators."""
-
-     @abstractmethod
-     def evaluate(self, task: Task, response: str) -> EvaluationResult:
-         """Evaluate a task and response."""
hud/evaluators/inspect.py DELETED
@@ -1,24 +0,0 @@
- from __future__ import annotations
-
- from typing import Any
-
- from hud.evaluators.base import EvaluationResult
-
-
- def inspect_evaluate(
-     response: Any,
-     answer: Any,
- ) -> EvaluationResult:
-     """Evaluate using Inspect-ai's evaluation models.
-
-     Args:
-         response: The response to evaluate
-         answer: The reference answer to compare against
-         model_name: The Inspect model to use
-         prompt: Optional custom prompt for evaluation
-         metrics: Optional list of metrics to evaluate against
-
-     Returns:
-         EvaluationResult with the evaluation results
-     """
-     return EvaluationResult(score=0.0, reason="Inspect evaluation not implemented", mode="inspect")
hud/evaluators/judge.py DELETED
@@ -1,189 +0,0 @@
- from __future__ import annotations
-
- import asyncio
- import base64
- from typing import Any, Protocol, TypedDict
-
- from hud.evaluators.base import EvaluationResult
- from hud.server import make_request
- from hud.settings import settings
-
-
- class LLM(Protocol):
-     """Protocol for LLM interfaces that can be used for evaluation."""
-
-     async def ainvoke(self, prompt: str, /) -> str: ...
-
-
- class Criterion(TypedDict, total=False):
-     """Criterion for judge-based evaluation."""
-
-     description: str
-     weight: float
-
-
- async def _call_eval_endpoint(
-     response: Any, answer: Any, criteria: list[Any], mode: str
- ) -> dict[str, Any]:
-     """Call the run_eval endpoint to evaluate the response."""
-     try:
-         result = await make_request(
-             method="POST",
-             url=f"{settings.base_url}/evaluations/run_eval",
-             json={"response": response, "answer": answer, "criteria": criteria, "mode": mode},
-             api_key=settings.api_key,
-         )
-         return result
-     except Exception as e:
-         # Fallback to local evaluation if remote call fails
-         return {
-             "score": -1.0,
-             "reason": f"Remote evaluation failed: {e!s}. Fallback to default score.",
-             "criteria_scores": {},
-         }
-
-
- def _process_input(data: Any) -> Any:
-     """Process input data, detecting and handling base64 images."""
-     if isinstance(data, bytes):
-         # Convert bytes to base64 string
-         return base64.b64encode(data).decode("utf-8")
-
-     if isinstance(data, str) and _is_base64_image(data):
-         # It's already a base64 string, just return it
-         return data
-
-     if isinstance(data, list) and all(isinstance(item, str) for item in data):
-         # Process list of strings
-         return data
-
-     # For other types, convert to string
-     return str(data) if not isinstance(data, str | dict) else data
-
-
- def _is_base64_image(data: Any) -> bool:
-     """Check if a string is a base64 encoded image."""
-     if not isinstance(data, str):
-         return False
-
-     # Check for common image data URI pattern
-     if data.startswith(("data:image/", "data:application/octet-stream")):
-         return True
-
-     # Check if it's a base64 encoded string with image header
-     try:
-         # First, validate it's base64 decodable
-         padding_needed = len(data) % 4
-         if padding_needed:
-             data += "=" * (4 - padding_needed)
-
-         # Try to decode the first few bytes to check for image signatures
-         sample = base64.b64decode(data[:30])
-
-         # Check for common image format signatures
-         return sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
-     except Exception:
-         return False
-
-
- def judge(
-     response: Any,
-     answer: Any,
-     llm: LLM | None = None,
-     criteria: list[str] | list[dict] | None = None,
- ) -> EvaluationResult:
-     """Judge a response against an answer using an LLM.
-
-     Args:
-         response: The response to evaluate
-         answer: The reference answer to compare against
-         llm: Optional langchain LLM to use for evaluation
-         criteria: Evaluation criteria as strings or dictionaries
-
-     Returns:
-         EvaluationResult with evaluation results
-     """
-     # Process inputs
-     processed_response = _process_input(response)
-     processed_answer = _process_input(answer)
-
-     # If LLM is provided, use it for evaluation
-     if llm:
-         return _evaluate_with_llm(processed_response, processed_answer, llm, criteria)
-
-     # Otherwise, use the remote evaluation service
-     mode = "LLM"
-     if isinstance(answer, bytes) or _is_base64_image(answer):
-         mode = "VLM"
-
-     # Call the eval endpoint synchronously
-     result = asyncio.run(
-         _call_eval_endpoint(
-             response=processed_response, answer=processed_answer, criteria=criteria or [], mode=mode
-         )
-     )
-
-     return EvaluationResult(
-         score=result.get("score", -1.0),
-         reason=result.get("reason", "Response evaluated"),
-         mode=mode,
-         criteria_scores=result.get("criteria_scores", {}),
-     )
-
-
- def _evaluate_with_llm(
-     response: Any, answer: Any, llm: LLM, criteria: list[str] | list[dict] | None = None
- ) -> EvaluationResult:
-     """Evaluate a response against an answer using a provided LLM."""
-     criteria_text = ""
-     if criteria:
-         criteria_text = "Use the following criteria:\n"
-         for c in criteria:
-             if isinstance(c, dict) and "description" in c:
-                 criteria_text += f"- {c['description']}\n"
-             elif isinstance(c, str):
-                 criteria_text += f"- {c}\n"
-
-     prompt = f"""Evaluate the quality of a response given a reference answer.
-
- REFERENCE ANSWER:
- {answer}
-
- RESPONSE TO EVALUATE:
- {response}
-
- {criteria_text}
- Rate the response on a scale from 0.0 to 1.0, where 1.0 is perfect.
- Provide a brief explanation for your rating.
- Format your answer as a JSON object with 'score' (float) and 'reason' (string) fields.
-     """
-
-     try:
-         # Run the evaluation asynchronously
-         result_text = asyncio.run(llm.ainvoke(prompt))
-
-         # Attempt to parse JSON response
-         import json
-         import re
-
-         # Try to extract JSON if wrapped in other text
-         json_match = re.search(r"\{.*?\}", result_text, re.DOTALL)
-         if json_match:
-             json_str = json_match.group(0)
-             result = json.loads(json_str)
-
-             return EvaluationResult(
-                 score=float(result.get("score", 0.5)),
-                 reason=result.get("reason", "Evaluated with custom LLM"),
-                 mode="custom_llm",
-             )
-
-         # If can't parse as JSON, use default values
-         return EvaluationResult(
-             score=0.5,
-             reason=f"Unable to parse LLM response as JSON. Raw response: {result_text[:100]}...",
-             mode="custom_llm",
-         )
-
-     except Exception as e:
-         return EvaluationResult(score=0.0, reason=f"LLM evaluation error: {e!s}", mode="custom_llm")
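
A quick sketch of how the removed judge() entry point was driven with a caller-supplied LLM; the EchoLLM stub below is illustrative, and anything exposing an async ainvoke(prompt) -> str satisfies the LLM protocol shown above:

    from hud.evaluators.judge import judge  # removed in 0.3.1


    class EchoLLM:
        """Stub satisfying the LLM protocol: async ainvoke(prompt) returning a string."""

        async def ainvoke(self, prompt: str) -> str:
            # A real implementation would call a model; here we return a fixed JSON verdict.
            return '{"score": 0.9, "reason": "Response closely matches the reference."}'


    result = judge(
        response="Paris is the capital of France.",
        answer="The capital of France is Paris.",
        llm=EchoLLM(),
        criteria=["Mentions Paris", "States that it is the capital"],
    )
    print(result.score, result.mode)  # 0.9 custom_llm

Without the llm argument, judge() instead dispatched to the remote run_eval endpoint, picking "VLM" mode when the answer looked like image data.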
hud/evaluators/match.py DELETED
@@ -1,156 +0,0 @@
- from __future__ import annotations
-
- import re
- from difflib import SequenceMatcher
- from typing import TYPE_CHECKING, Protocol
-
- from textdistance import levenshtein
-
- from hud.evaluators.base import EvaluationResult
-
- if TYPE_CHECKING:
-     from collections.abc import Sequence
-
-
- class _Stringable(Protocol):
-     def __str__(self) -> str: ...
-
-
- def match_single(response: _Stringable, answer: _Stringable) -> EvaluationResult:
-     """Check if the answer is present within the response.
-
-     Args:
-         response: The response to evaluate
-         answer: The expected answer
-
-     Returns:
-         EvaluationResult with score=1.0 if match, 0.0 otherwise
-     """
-     passed = str(answer).lower().strip() in str(response).lower().strip()
-     return EvaluationResult(
-         score=1.0 if passed else 0.0,
-         reason="Exact match" if passed else "No exact match found",
-         mode="single",
-     )
-
-
- def match_all(response: _Stringable, answers: Sequence[_Stringable]) -> EvaluationResult:
-     """Count how many expected answers are in the response.
-
-     Args:
-         response: The response to evaluate
-         answers: List of expected answers
-
-     Returns:
-         EvaluationResult with score=proportion of matches (0.0-1.0)
-     """
-     response_str = str(response).lower()
-     matches = 0
-
-     for answer in answers:
-         if str(answer).lower() in response_str:
-             matches += 1
-
-     score = matches / len(answers) if answers else 0.0
-
-     if matches == len(answers):
-         reason = f"All {matches} expected items found"
-     else:
-         reason = f"Only {matches} of {len(answers)} expected items found"
-
-     return EvaluationResult(score=score, reason=reason, mode="all")
-
-
- def match_fuzzy(response: _Stringable, answer: _Stringable) -> EvaluationResult:
-     """Calculate similarity using Levenshtein distance.
-
-     Args:
-         response: The response to evaluate
-         answer: The expected answer
-
-     Returns:
-         EvaluationResult with score=similarity (0.0-1.0)
-     """
-     s1 = str(response).lower()
-     s2 = str(answer).lower()
-
-     if s1 == s2:
-         score = 1.0
-     elif len(s1) == 0 or len(s2) == 0:
-         score = 0.0
-     else:
-         # Use Levenshtein distance
-         distance = levenshtein.distance(s1, s2)
-         max_len = max(len(s1), len(s2))
-         score = 1.0 - (distance / max_len)
-
-     return EvaluationResult(
-         score=score, reason=f"Fuzzy match with {score:.1%} similarity", mode="fuzz"
-     )
-
-
- def match_regex(response: _Stringable, pattern: str) -> EvaluationResult:
-     """Check if response matches regex pattern.
-
-     Args:
-         response: The response to evaluate
-         pattern: Regular expression pattern to match
-
-     Returns:
-         EvaluationResult with score=1.0 if match, 0.0 otherwise
-     """
-     try:
-         regex = re.compile(pattern, re.DOTALL)
-         passed = bool(regex.search(str(response)))
-         return EvaluationResult(
-             score=1.0 if passed else 0.0,
-             reason="Regex pattern matched" if passed else "Regex pattern did not match",
-             mode="regex",
-         )
-     except re.error:
-         return EvaluationResult(score=0.0, reason="Invalid regex pattern", mode="regex")
-
-
- def match_diff(response: _Stringable, answer: _Stringable) -> EvaluationResult:
-     """Compare difference between response and answer.
-
-     Args:
-         response: The response to evaluate
-         answer: The expected answer
-
-     Returns:
-         EvaluationResult with score=similarity (0.0-1.0)
-     """
-     if isinstance(response, int | float) and isinstance(answer, int | float):
-         score = _match_numeric_diff(response, answer)
-         reason = f"Numeric difference: {abs(response - answer)}"
-     else:
-         score = _match_string_diff(response, answer)
-         reason = f"String difference with {score:.1%} similarity"
-
-     return EvaluationResult(score=score, reason=reason, mode="diff")
-
-
- def _match_string_diff(response: _Stringable, answer: _Stringable) -> float:
-     """Compare difference between response and answer strings."""
-     matcher = SequenceMatcher(None, str(response), str(answer))
-     return matcher.ratio()
-
-
- def _match_numeric_diff(response: float, answer: float) -> float:
-     """Calculate normalized difference between numeric values.
-
-     Returns a value between 0 and 1, where 1 means identical and 0 means maximum difference.
-     """
-     if response == answer:
-         return 1.0
-
-     # Simple absolute difference normalized to a 0-1 scale
-     diff = abs(response - answer)
-     max_val = max(abs(response), abs(answer))
-
-     if max_val == 0:
-         return 1.0  # Both are zero
-
-     # Normalize and invert so 1.0 means identical
-     return max(0.0, 1.0 - min(1.0, diff / max_val))
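
For callers migrating off this module, a short sketch of the removed matcher helpers and the scores they produced (values in the comments are approximate):

    from hud.evaluators.match import match_all, match_fuzzy, match_regex, match_single  # removed in 0.3.1

    response = "The capital of France is Paris, on the Seine."

    print(match_single(response, "Paris").score)                   # 1.0  (case-insensitive substring)
    print(match_all(response, ["Paris", "Seine", "Loire"]).score)  # ~0.67 (2 of 3 answers found)
    print(match_fuzzy("color", "colour").score)                    # ~0.83 (Levenshtein similarity)
    print(match_regex(response, r"capital of \w+").score)          # 1.0  (pattern matched)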
hud/evaluators/remote.py DELETED
@@ -1,65 +0,0 @@
- from __future__ import annotations
-
- import asyncio
- from typing import Any
-
- from hud.evaluators.base import EvaluationResult
- from hud.server import make_request
- from hud.settings import settings
-
-
- async def _remote_eval_call(
-     response: Any, answer: Any, eval_type: str, config: dict[str, Any] | None = None
- ) -> dict[str, Any]:
-     """Send an evaluation request to the remote server.
-
-     Args:
-         response: The response to evaluate
-         answer: The reference answer to compare against
-         eval_type: Type of evaluation (e.g., "match", "judge", "agent")
-         config: Optional configuration parameters
-
-     Returns:
-         Dictionary with evaluation results from the server
-     """
-     try:
-         result = await make_request(
-             method="POST",
-             url=f"{settings.base_url}/evaluations/evaluate",
-             json={
-                 "response": response,
-                 "answer": answer,
-                 "type": eval_type,
-                 "config": config or {},
-             },
-             api_key=settings.api_key,
-         )
-         return result
-     except Exception as e:
-         return {"score": -1.0, "reason": f"Remote evaluation failed: {e!s}", "details": {}}
-
-
- def remote_evaluate(
-     response: Any, answer: Any, eval_type: str = "default", config: dict[str, Any] | None = None
- ) -> EvaluationResult:
-     """Evaluate a response using remote evaluation services.
-
-     Args:
-         response: The response to evaluate
-         answer: The reference answer to compare against
-         eval_type: Type of evaluation to perform
-         config: Optional configuration for the evaluation
-
-     Returns:
-         EvaluationResult containing the evaluation results
-     """
-     result = asyncio.run(
-         _remote_eval_call(response=response, answer=answer, eval_type=eval_type, config=config)
-     )
-
-     return EvaluationResult(
-         score=result.get("score", -1.0),
-         reason=result.get("reason", "Remote evaluation completed"),
-         mode=eval_type,
-         criteria_scores=result.get("details", {}),
-     )
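
remote_evaluate() was a thin synchronous wrapper over the /evaluations/evaluate endpoint; a minimal sketch of the call, assuming a valid HUD API key is configured in hud.settings (on failure it returns score=-1.0 rather than raising):

    from hud.evaluators.remote import remote_evaluate  # removed in 0.3.1

    result = remote_evaluate(
        response="The answer is 42.",
        answer="42",
        eval_type="match",  # the docstring also lists "judge" and "agent"
    )
    print(result.score, result.reason)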
hud/evaluators/tests/__init__.py — file without changes
hud/evaluators/tests/test_inspect.py DELETED
@@ -1,12 +0,0 @@
- from __future__ import annotations
-
- from hud.evaluators.inspect import inspect_evaluate
-
-
- def test_inspect_evaluate_basic():
-     """Test basic functionality of inspect_evaluate."""
-     result = inspect_evaluate("Test response", "Test answer")
-
-     assert result.score == 0.0
-     assert result.reason == "Inspect evaluation not implemented"
-     assert result.mode == "inspect"