hud-python 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of hud-python may be problematic.
- hud/__init__.py +5 -3
- hud/adapters/__init__.py +2 -1
- hud/adapters/claude/adapter.py +13 -17
- hud/adapters/common/adapter.py +3 -3
- hud/adapters/common/tests/__init__.py +0 -0
- hud/adapters/common/tests/test_adapter.py +277 -0
- hud/adapters/common/types.py +3 -6
- hud/adapters/operator/adapter.py +22 -29
- hud/agent/__init__.py +9 -1
- hud/agent/base.py +28 -28
- hud/agent/claude.py +69 -60
- hud/agent/langchain.py +204 -0
- hud/agent/operator.py +75 -67
- hud/env/__init__.py +5 -5
- hud/env/client.py +2 -2
- hud/env/docker_client.py +37 -39
- hud/env/environment.py +91 -66
- hud/env/local_docker_client.py +5 -7
- hud/env/remote_client.py +40 -29
- hud/env/remote_docker_client.py +13 -3
- hud/evaluators/__init__.py +2 -3
- hud/evaluators/base.py +4 -3
- hud/evaluators/inspect.py +3 -8
- hud/evaluators/judge.py +34 -58
- hud/evaluators/match.py +42 -49
- hud/evaluators/remote.py +13 -26
- hud/evaluators/tests/__init__.py +0 -0
- hud/evaluators/tests/test_inspect.py +12 -0
- hud/evaluators/tests/test_judge.py +231 -0
- hud/evaluators/tests/test_match.py +115 -0
- hud/evaluators/tests/test_remote.py +98 -0
- hud/exceptions.py +167 -0
- hud/gym.py +12 -10
- hud/job.py +525 -47
- hud/server/__init__.py +2 -2
- hud/server/requests.py +148 -186
- hud/server/tests/__init__.py +0 -0
- hud/server/tests/test_requests.py +275 -0
- hud/settings.py +3 -2
- hud/task.py +12 -22
- hud/taskset.py +44 -11
- hud/trajectory.py +6 -9
- hud/types.py +14 -9
- hud/utils/__init__.py +2 -2
- hud/utils/common.py +37 -13
- hud/utils/config.py +44 -29
- hud/utils/progress.py +149 -0
- hud/utils/telemetry.py +10 -11
- hud/utils/tests/__init__.py +0 -0
- hud/utils/tests/test_common.py +52 -0
- hud/utils/tests/test_config.py +129 -0
- hud/utils/tests/test_progress.py +225 -0
- hud/utils/tests/test_telemetry.py +37 -0
- hud/utils/tests/test_version.py +8 -0
- {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/METADATA +44 -21
- hud_python-0.2.3.dist-info/RECORD +62 -0
- hud_python-0.2.1.dist-info/RECORD +0 -44
- {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/WHEEL +0 -0
- {hud_python-0.2.1.dist-info → hud_python-0.2.3.dist-info}/licenses/LICENSE +0 -0
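
For reference, a quick way to confirm which version is installed after upgrading; this is a minimal sketch that assumes the distribution name on the index is "hud-python", as in the wheel filenames above:

# Minimal installed-version check; "hud-python" is the distribution name taken from the wheel filenames above.
from importlib.metadata import version

print(version("hud-python"))  # expected to print "0.2.3" after the upgrade
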
hud/evaluators/judge.py
CHANGED

@@ -11,33 +11,26 @@ from hud.settings import settings
 
 class LLM(Protocol):
     """Protocol for LLM interfaces that can be used for evaluation."""
-
+
+    async def ainvoke(self, prompt: str, /) -> str: ...
 
 
 class Criterion(TypedDict, total=False):
     """Criterion for judge-based evaluation."""
-
+
     description: str
     weight: float
 
 
 async def _call_eval_endpoint(
-    response: Any,
-    answer: Any,
-    criteria: list[Any],
-    mode: str
+    response: Any, answer: Any, criteria: list[Any], mode: str
 ) -> dict[str, Any]:
     """Call the run_eval endpoint to evaluate the response."""
     try:
         result = await make_request(
             method="POST",
             url=f"{settings.base_url}/evaluations/run_eval",
-            json={
-                "response": response,
-                "answer": answer,
-                "criteria": criteria,
-                "mode": mode
-            },
+            json={"response": response, "answer": answer, "criteria": criteria, "mode": mode},
             api_key=settings.api_key,
         )
         return result
@@ -46,31 +39,24 @@ async def _call_eval_endpoint(
         return {
             "score": -1.0,
             "reason": f"Remote evaluation failed: {e!s}. Fallback to default score.",
-            "criteria_scores": {}
+            "criteria_scores": {},
         }
 
 
-def _determine_mode(answer: Any) -> str:
-    """Determine the evaluation mode based on answer type."""
-    if isinstance(answer, bytes) or _is_base64_image(answer):
-        return "VLM"
-    return "LLM"
-
-
 def _process_input(data: Any) -> Any:
     """Process input data, detecting and handling base64 images."""
     if isinstance(data, bytes):
         # Convert bytes to base64 string
         return base64.b64encode(data).decode("utf-8")
-
+
     if isinstance(data, str) and _is_base64_image(data):
         # It's already a base64 string, just return it
         return data
-
+
     if isinstance(data, list) and all(isinstance(item, str) for item in data):
         # Process list of strings
         return data
-
+
     # For other types, convert to string
     return str(data) if not isinstance(data, str | dict) else data
 
@@ -79,11 +65,11 @@ def _is_base64_image(data: Any) -> bool:
     """Check if a string is a base64 encoded image."""
     if not isinstance(data, str):
         return False
-
+
     # Check for common image data URI pattern
     if data.startswith(("data:image/", "data:application/octet-stream")):
         return True
-
+
     # Check if it's a base64 encoded string with image header
     try:
         # First, validate it's base64 decodable
@@ -95,9 +81,7 @@ def _is_base64_image(data: Any) -> bool:
         sample = base64.b64decode(data[:30])
 
         # Check for common image format signatures
-        return (
-            sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
-        )
+        return sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
     except Exception:
         return False
 
@@ -109,50 +93,46 @@ def judge(
     criteria: list[str] | list[dict] | None = None,
 ) -> EvaluationResult:
     """Judge a response against an answer using an LLM.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
         llm: Optional langchain LLM to use for evaluation
         criteria: Evaluation criteria as strings or dictionaries
-
+
     Returns:
         EvaluationResult with evaluation results
     """
     # Process inputs
    processed_response = _process_input(response)
    processed_answer = _process_input(answer)
-
+
     # If LLM is provided, use it for evaluation
     if llm:
         return _evaluate_with_llm(processed_response, processed_answer, llm, criteria)
-
+
     # Otherwise, use the remote evaluation service
     mode = "LLM"
     if isinstance(answer, bytes) or _is_base64_image(answer):
         mode = "VLM"
-
+
     # Call the eval endpoint synchronously
-    result = asyncio.run(
-
-
-
-
-
-
+    result = asyncio.run(
+        _call_eval_endpoint(
+            response=processed_response, answer=processed_answer, criteria=criteria or [], mode=mode
+        )
+    )
+
     return EvaluationResult(
         score=result.get("score", -1.0),
         reason=result.get("reason", "Response evaluated"),
         mode=mode,
-        criteria_scores=result.get("criteria_scores", {})
+        criteria_scores=result.get("criteria_scores", {}),
     )
 
 
 def _evaluate_with_llm(
-    response: Any,
-    answer: Any,
-    llm: LLM,
-    criteria: list[str] | list[dict] | None = None
+    response: Any, answer: Any, llm: LLM, criteria: list[str] | list[dict] | None = None
 ) -> EvaluationResult:
     """Evaluate a response against an answer using a provided LLM."""
     criteria_text = ""
@@ -163,7 +143,7 @@ def _evaluate_with_llm(
             criteria_text += f"- {c['description']}\n"
         elif isinstance(c, str):
             criteria_text += f"- {c}\n"
-
+
     prompt = f"""Evaluate the quality of a response given a reference answer.
 
 REFERENCE ANSWER:
@@ -181,33 +161,29 @@ Format your answer as a JSON object with 'score' (float) and 'reason' (string) f
     try:
         # Run the evaluation asynchronously
         result_text = asyncio.run(llm.ainvoke(prompt))
-
+
         # Attempt to parse JSON response
         import json
         import re
-
+
         # Try to extract JSON if wrapped in other text
         json_match = re.search(r"\{.*?\}", result_text, re.DOTALL)
         if json_match:
             json_str = json_match.group(0)
             result = json.loads(json_str)
-
+
             return EvaluationResult(
                 score=float(result.get("score", 0.5)),
                 reason=result.get("reason", "Evaluated with custom LLM"),
-                mode="custom_llm"
+                mode="custom_llm",
             )
-
+
         # If can't parse as JSON, use default values
         return EvaluationResult(
             score=0.5,
             reason=f"Unable to parse LLM response as JSON. Raw response: {result_text[:100]}...",
-            mode="custom_llm"
+            mode="custom_llm",
         )
-
+
     except Exception as e:
-        return EvaluationResult(
-            score=0.0,
-            reason=f"LLM evaluation error: {e!s}",
-            mode="custom_llm"
-        )
+        return EvaluationResult(score=0.0, reason=f"LLM evaluation error: {e!s}", mode="custom_llm")
hud/evaluators/match.py
CHANGED

@@ -2,20 +2,27 @@ from __future__ import annotations
 
 import re
 from difflib import SequenceMatcher
-from typing import
+from typing import TYPE_CHECKING, Protocol
 
 from textdistance import levenshtein
 
 from hud.evaluators.base import EvaluationResult
 
+if TYPE_CHECKING:
+    from collections.abc import Sequence
 
-def match_single(response: Any, answer: Any) -> EvaluationResult:
+
+class _Stringable(Protocol):
+    def __str__(self) -> str: ...
+
+
+def match_single(response: _Stringable, answer: _Stringable) -> EvaluationResult:
     """Check if the answer is present within the response.
-
+
     Args:
         response: The response to evaluate
         answer: The expected answer
-
+
     Returns:
         EvaluationResult with score=1.0 if match, 0.0 otherwise
     """
@@ -23,54 +30,50 @@ def match_single(response: Any, answer: Any) -> EvaluationResult:
     return EvaluationResult(
         score=1.0 if passed else 0.0,
         reason="Exact match" if passed else "No exact match found",
-        mode="single"
+        mode="single",
     )
 
 
-def match_all(response:
+def match_all(response: _Stringable, answers: Sequence[_Stringable]) -> EvaluationResult:
     """Count how many expected answers are in the response.
-
+
     Args:
         response: The response to evaluate
         answers: List of expected answers
-
+
     Returns:
         EvaluationResult with score=proportion of matches (0.0-1.0)
     """
     response_str = str(response).lower()
     matches = 0
-
+
     for answer in answers:
         if str(answer).lower() in response_str:
             matches += 1
-
+
     score = matches / len(answers) if answers else 0.0
-
+
     if matches == len(answers):
         reason = f"All {matches} expected items found"
     else:
         reason = f"Only {matches} of {len(answers)} expected items found"
-
-    return EvaluationResult(
-        score=score,
-        reason=reason,
-        mode="all"
-    )
 
+    return EvaluationResult(score=score, reason=reason, mode="all")
 
-def match_fuzzy(response: Any, answer: Any) -> EvaluationResult:
+
+def match_fuzzy(response: _Stringable, answer: _Stringable) -> EvaluationResult:
     """Calculate similarity using Levenshtein distance.
-
+
     Args:
         response: The response to evaluate
         answer: The expected answer
-
+
     Returns:
         EvaluationResult with score=similarity (0.0-1.0)
     """
     s1 = str(response).lower()
     s2 = str(answer).lower()
-
+
     if s1 == s2:
         score = 1.0
     elif len(s1) == 0 or len(s2) == 0:
@@ -80,21 +83,19 @@ def match_fuzzy(response: Any, answer: Any) -> EvaluationResult:
         distance = levenshtein.distance(s1, s2)
         max_len = max(len(s1), len(s2))
         score = 1.0 - (distance / max_len)
-
+
     return EvaluationResult(
-        score=score,
-        reason=f"Fuzzy match with {score:.1%} similarity",
-        mode="fuzz"
+        score=score, reason=f"Fuzzy match with {score:.1%} similarity", mode="fuzz"
     )
 
 
-def match_regex(response: Any, pattern: str) -> EvaluationResult:
+def match_regex(response: _Stringable, pattern: str) -> EvaluationResult:
     """Check if response matches regex pattern.
-
+
     Args:
         response: The response to evaluate
         pattern: Regular expression pattern to match
-
+
     Returns:
         EvaluationResult with score=1.0 if match, 0.0 otherwise
     """
@@ -104,23 +105,19 @@ def match_regex(response: Any, pattern: str) -> EvaluationResult:
         return EvaluationResult(
             score=1.0 if passed else 0.0,
             reason="Regex pattern matched" if passed else "Regex pattern did not match",
-            mode="regex"
+            mode="regex",
         )
     except re.error:
-        return EvaluationResult(
-            score=0.0,
-            reason="Invalid regex pattern",
-            mode="regex"
-        )
+        return EvaluationResult(score=0.0, reason="Invalid regex pattern", mode="regex")
 
 
-def match_diff(response: Any, answer: Any) -> EvaluationResult:
+def match_diff(response: _Stringable, answer: _Stringable) -> EvaluationResult:
     """Compare difference between response and answer.
-
+
     Args:
         response: The response to evaluate
         answer: The expected answer
-
+
     Returns:
         EvaluationResult with score=similarity (0.0-1.0)
     """
@@ -130,34 +127,30 @@ def match_diff(response: Any, answer: Any) -> EvaluationResult:
     else:
         score = _match_string_diff(response, answer)
         reason = f"String difference with {score:.1%} similarity"
-
-    return EvaluationResult(
-        score=score,
-        reason=reason,
-        mode="diff"
-    )
+
+    return EvaluationResult(score=score, reason=reason, mode="diff")
 
 
-def _match_string_diff(response:
+def _match_string_diff(response: _Stringable, answer: _Stringable) -> float:
     """Compare difference between response and answer strings."""
     matcher = SequenceMatcher(None, str(response), str(answer))
     return matcher.ratio()
-
+
 
 def _match_numeric_diff(response: float, answer: float) -> float:
     """Calculate normalized difference between numeric values.
-
+
     Returns a value between 0 and 1, where 1 means identical and 0 means maximum difference.
     """
     if response == answer:
         return 1.0
-
+
     # Simple absolute difference normalized to a 0-1 scale
     diff = abs(response - answer)
     max_val = max(abs(response), abs(answer))
-
+
     if max_val == 0:
         return 1.0  # Both are zero
-
+
     # Normalize and invert so 1.0 means identical
     return max(0.0, 1.0 - min(1.0, diff / max_val))
hud/evaluators/remote.py
CHANGED

@@ -9,19 +9,16 @@ from hud.settings import settings
 
 
 async def _remote_eval_call(
-    response: Any,
-    answer: Any,
-    eval_type: str,
-    config: dict[str, Any] | None = None
+    response: Any, answer: Any, eval_type: str, config: dict[str, Any] | None = None
 ) -> dict[str, Any]:
     """Send an evaluation request to the remote server.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
         eval_type: Type of evaluation (e.g., "match", "judge", "agent")
         config: Optional configuration parameters
-
+
     Returns:
         Dictionary with evaluation results from the server
     """
@@ -33,46 +30,36 @@ async def _remote_eval_call(
                 "response": response,
                 "answer": answer,
                 "type": eval_type,
-                "config": config or {}
+                "config": config or {},
             },
             api_key=settings.api_key,
         )
         return result
     except Exception as e:
-        return {
-            "score": -1.0,
-            "reason": f"Remote evaluation failed: {e!s}",
-            "details": {}
-        }
+        return {"score": -1.0, "reason": f"Remote evaluation failed: {e!s}", "details": {}}
 
 
 def remote_evaluate(
-    response: Any,
-    answer: Any,
-    eval_type: str = "default",
-    config: dict[str, Any] | None = None
+    response: Any, answer: Any, eval_type: str = "default", config: dict[str, Any] | None = None
 ) -> EvaluationResult:
     """Evaluate a response using remote evaluation services.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
         eval_type: Type of evaluation to perform
         config: Optional configuration for the evaluation
-
+
     Returns:
         EvaluationResult containing the evaluation results
     """
-    result = asyncio.run(
-        response=response,
-
-
-        config=config
-    ))
-
+    result = asyncio.run(
+        _remote_eval_call(response=response, answer=answer, eval_type=eval_type, config=config)
+    )
+
     return EvaluationResult(
         score=result.get("score", -1.0),
         reason=result.get("reason", "Remote evaluation completed"),
         mode=eval_type,
-        criteria_scores=result.get("details", {})
+        criteria_scores=result.get("details", {}),
     )

hud/evaluators/tests/__init__.py
File without changes

hud/evaluators/tests/test_inspect.py
ADDED

@@ -0,0 +1,12 @@
+from __future__ import annotations
+
+from hud.evaluators.inspect import inspect_evaluate
+
+
+def test_inspect_evaluate_basic():
+    """Test basic functionality of inspect_evaluate."""
+    result = inspect_evaluate("Test response", "Test answer")
+
+    assert result.score == 0.0
+    assert result.reason == "Inspect evaluation not implemented"
+    assert result.mode == "inspect"