hud-python 0.2.2 → 0.2.4 (py3-none-any.whl)
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of hud-python might be problematic.
- hud/__init__.py +4 -3
- hud/adapters/claude/adapter.py +5 -14
- hud/adapters/common/adapter.py +3 -3
- hud/adapters/common/tests/__init__.py +0 -0
- hud/adapters/common/tests/test_adapter.py +277 -0
- hud/adapters/common/types.py +3 -3
- hud/adapters/operator/adapter.py +16 -23
- hud/agent/__init__.py +8 -1
- hud/agent/base.py +28 -28
- hud/agent/claude.py +69 -60
- hud/agent/langchain.py +32 -26
- hud/agent/operator.py +75 -67
- hud/env/__init__.py +5 -5
- hud/env/client.py +2 -2
- hud/env/docker_client.py +37 -39
- hud/env/environment.py +91 -66
- hud/env/local_docker_client.py +5 -7
- hud/env/remote_client.py +39 -32
- hud/env/remote_docker_client.py +13 -3
- hud/evaluators/__init__.py +2 -3
- hud/evaluators/base.py +4 -3
- hud/evaluators/inspect.py +3 -8
- hud/evaluators/judge.py +34 -58
- hud/evaluators/match.py +42 -49
- hud/evaluators/remote.py +13 -26
- hud/evaluators/tests/__init__.py +0 -0
- hud/evaluators/tests/test_inspect.py +12 -0
- hud/evaluators/tests/test_judge.py +231 -0
- hud/evaluators/tests/test_match.py +115 -0
- hud/evaluators/tests/test_remote.py +98 -0
- hud/exceptions.py +167 -0
- hud/gym.py +9 -7
- hud/job.py +179 -109
- hud/server/__init__.py +2 -2
- hud/server/requests.py +148 -186
- hud/server/tests/__init__.py +0 -0
- hud/server/tests/test_requests.py +275 -0
- hud/settings.py +3 -2
- hud/task.py +9 -19
- hud/taskset.py +44 -11
- hud/trajectory.py +6 -9
- hud/types.py +12 -9
- hud/utils/__init__.py +2 -2
- hud/utils/common.py +36 -15
- hud/utils/config.py +45 -30
- hud/utils/progress.py +34 -21
- hud/utils/telemetry.py +10 -11
- hud/utils/tests/__init__.py +0 -0
- hud/utils/tests/test_common.py +52 -0
- hud/utils/tests/test_config.py +129 -0
- hud/utils/tests/test_progress.py +225 -0
- hud/utils/tests/test_telemetry.py +37 -0
- hud/utils/tests/test_version.py +8 -0
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/METADATA +9 -6
- hud_python-0.2.4.dist-info/RECORD +62 -0
- hud_python-0.2.2.dist-info/RECORD +0 -46
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/WHEEL +0 -0
- {hud_python-0.2.2.dist-info → hud_python-0.2.4.dist-info}/licenses/LICENSE +0 -0
hud/env/remote_client.py
CHANGED
@@ -5,23 +5,25 @@ from base64 import b64decode
 from typing import TYPE_CHECKING, Any

 from hud.env.client import Client
+from hud.exceptions import HudResponseError
 from hud.server import make_request
 from hud.settings import settings
 from hud.types import EnvironmentStatus
 from hud.utils import ExecuteResult

 if TYPE_CHECKING:
-    from hud.utils.config import HudStyleConfig
+    from hud.utils.config import FunctionConfig

 logger = logging.getLogger("hud.env.remote_env_client")

+
 class RemoteClient(Client):
     """
     Remote environment client implementation.
-
+
     Uses the HUD API to manage a remote environment.
     """
-
+
     @classmethod
     async def create(
         cls,
@@ -33,21 +35,23 @@ class RemoteClient(Client):
     ) -> tuple[RemoteClient, dict[str, Any]]:
         """
         Creates a remote environment client from a dockerfile or gym_id.
-
+
         Args:
             dockerfile: The dockerfile content to build the environment
             gym_id: The gym_id of the environment to create
             metadata: Metadata to associate with the environment
-
+
         Returns:
-
+            A tuple containing the remote environment client and the build metadata
+
+        Raises:
+            HudResponseError: If the environment creation is successful but the response is invalid.
         """

         # Validate arguments
         if metadata is None:
             metadata = {}

-
         request_data = {
             # still named run_id for backwards compatibility
             "run_id": job_id,
@@ -63,33 +67,38 @@ class RemoteClient(Client):
             json=request_data,
             api_key=settings.api_key,
         )
-
+
         # Get the environment ID from the response
         env_id = response.get("id")
         if not env_id:
-            raise
-
+            raise HudResponseError(
+                message="Failed to create remote environment: No ID returned in API response. "
+                "Please contact support if this issue persists.",
+                response_json=response,
+            )
+
         # Create the controller instance
         controller = cls(env_id)
-
+
         build_data = response.get("metadata", {})
-
+
         if response.get("readme"):
-            logger.info(
-
-
+            logger.info(
+                "[HUD] %s gym created, see how to use it at %s", gym_id, response.get("readme")
+            )
+
         return controller, build_data

     def __init__(self, env_id: str) -> None:
         """
         Initialize the RemoteClient.
-
+
         Args:
             env_id: ID of the remote environment to control
         """
         super().__init__()
         self._env_id = env_id
-
+
     @property
     def env_id(self) -> str:
         """The ID of the remote environment."""
@@ -98,7 +107,7 @@ class RemoteClient(Client):
     async def get_status(self) -> EnvironmentStatus:
         """
         Get the current status of the remote environment.
-
+
         Returns:
             EnvironmentStatus: The current status of the environment
         """
@@ -111,7 +120,7 @@ class RemoteClient(Client):
             logger.debug("Environment status response: %s", response)

             status = response.get("state", "").lower()
-
+
             if status == "running":
                 return EnvironmentStatus.RUNNING
             elif status == "initializing" or status == "pending":
@@ -122,12 +131,12 @@ class RemoteClient(Client):
                 # Any other status is considered an error
                 logger.warning("Abnormal environment status response: %s", response)
                 return EnvironmentStatus.ERROR
-
+
         except Exception:
             # If we can't connect to the API or there's any other error
             logger.info("(potentially transient) Error getting environment status")
             return EnvironmentStatus.ERROR
-
+
     async def execute(
         self,
         command: list[str],
@@ -138,11 +147,11 @@ class RemoteClient(Client):
         """
         Execute a command in the environment.
         No-op in some environments (like browser use).
-
+
         Args:
             command: Command to execute
             workdir: Working directory for the command (ignored for remote environments)
-
+
         Returns:
             ExecuteResult: Result of the command execution
         """
@@ -150,21 +159,20 @@ class RemoteClient(Client):
             method="POST",
             url=f"{settings.base_url}/v2/environments/{self.env_id}/execute",
             json={
-
-
-
+                "command": command,
+                "workdir": workdir,
+                "timeout": timeout,
             },
             api_key=settings.api_key,
         )
-
+
         return ExecuteResult(
             stdout=b64decode(data["stdout"]),
             stderr=b64decode(data["stderr"]),
-            exit_code=data["exit_code"]
+            exit_code=data["exit_code"],
         )

-
-    async def invoke(self, config: HudStyleConfig) -> tuple[Any, bytes, bytes]:
+    async def invoke(self, config: FunctionConfig) -> tuple[Any, bytes, bytes]:
         """
         Invoke a function in the environment.
         """
@@ -174,9 +182,8 @@ class RemoteClient(Client):
             json=config.model_dump(),
             api_key=settings.api_key,
         )
-
-        return data["result"], b64decode(data["stdout"]), b64decode(data["stderr"])

+        return data["result"], b64decode(data["stdout"]), b64decode(data["stderr"])

     async def close(self) -> None:
         """
hud/env/remote_docker_client.py
CHANGED
@@ -5,6 +5,7 @@ from base64 import b64decode, b64encode
 from typing import Any

 from hud.env.docker_client import DockerClient
+from hud.exceptions import HudResponseError
 from hud.server import make_request
 from hud.settings import settings
 from hud.types import EnvironmentStatus
@@ -39,7 +40,10 @@ class RemoteDockerClient(DockerClient):
             metadata: Metadata to associate with the environment

         Returns:
-
+            A tuple containing the remote environment client and the build metadata
+
+        Raises:
+            HudResponseError: If the environment creation fails.
         """

         # Validate arguments
@@ -48,7 +52,7 @@ class RemoteDockerClient(DockerClient):

         logger.info("Creating remote environment")

-        true_gym_id = await get_gym_id("
+        true_gym_id = await get_gym_id("docker")

         # augment metadata with dockerfile
         if "environment_config" not in metadata:
@@ -73,7 +77,13 @@ class RemoteDockerClient(DockerClient):
         # Get the environment ID from the response
         env_id = response.get("id")
         if not env_id:
-            raise
+            raise HudResponseError(
+                message=(
+                    "Failed to create remote environment: No ID returned in API response. "
+                    "Please contact support if this issue persists."
+                ),
+                response_json=response,
+            )

         # Create the controller instance
         controller = cls(env_id)
hud/evaluators/__init__.py
CHANGED
hud/evaluators/base.py
CHANGED
@@ -11,21 +11,22 @@ if TYPE_CHECKING:

 class EvaluationResult(BaseModel):
     """Result of an evaluation.
-
+
     Attributes:
         score: Float score between 0 and 1
         reason: Explanation of the evaluation
         mode: Mode used for matching, if applicable
     """
-
+
     score: float
     reason: str
     mode: str | None = None
     criteria_scores: dict[str, float] | None = Field(default_factory=dict)

+
 class Evaluator(ABC):
     """Abstract base class for evaluators."""
-
+
     @abstractmethod
     def evaluate(self, task: Task, response: str) -> EvaluationResult:
        """Evaluate a task and response."""
hud/evaluators/inspect.py
CHANGED
@@ -10,20 +10,15 @@ def inspect_evaluate(
     answer: Any,
 ) -> EvaluationResult:
     """Evaluate using Inspect-ai's evaluation models.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
         model_name: The Inspect model to use
         prompt: Optional custom prompt for evaluation
         metrics: Optional list of metrics to evaluate against
-
+
     Returns:
         EvaluationResult with the evaluation results
     """
-    return EvaluationResult(
-        score=0.0,
-        reason="Inspect evaluation not implemented",
-        mode="inspect"
-    )
-
+    return EvaluationResult(score=0.0, reason="Inspect evaluation not implemented", mode="inspect")
hud/evaluators/judge.py
CHANGED
@@ -11,33 +11,26 @@ from hud.settings import settings

 class LLM(Protocol):
     """Protocol for LLM interfaces that can be used for evaluation."""
-
+
+    async def ainvoke(self, prompt: str, /) -> str: ...


 class Criterion(TypedDict, total=False):
     """Criterion for judge-based evaluation."""
-
+
     description: str
     weight: float


 async def _call_eval_endpoint(
-    response: Any,
-    answer: Any,
-    criteria: list[Any],
-    mode: str
+    response: Any, answer: Any, criteria: list[Any], mode: str
 ) -> dict[str, Any]:
     """Call the run_eval endpoint to evaluate the response."""
     try:
         result = await make_request(
             method="POST",
             url=f"{settings.base_url}/evaluations/run_eval",
-            json={
-                "response": response,
-                "answer": answer,
-                "criteria": criteria,
-                "mode": mode
-            },
+            json={"response": response, "answer": answer, "criteria": criteria, "mode": mode},
             api_key=settings.api_key,
         )
         return result
@@ -46,31 +39,24 @@ async def _call_eval_endpoint(
         return {
             "score": -1.0,
             "reason": f"Remote evaluation failed: {e!s}. Fallback to default score.",
-            "criteria_scores": {}
+            "criteria_scores": {},
         }


-def _determine_mode(answer: Any) -> str:
-    """Determine the evaluation mode based on answer type."""
-    if isinstance(answer, bytes) or _is_base64_image(answer):
-        return "VLM"
-    return "LLM"
-
-
 def _process_input(data: Any) -> Any:
     """Process input data, detecting and handling base64 images."""
     if isinstance(data, bytes):
         # Convert bytes to base64 string
         return base64.b64encode(data).decode("utf-8")
-
+
     if isinstance(data, str) and _is_base64_image(data):
         # It's already a base64 string, just return it
         return data
-
+
     if isinstance(data, list) and all(isinstance(item, str) for item in data):
         # Process list of strings
         return data
-
+
     # For other types, convert to string
     return str(data) if not isinstance(data, str | dict) else data

@@ -79,11 +65,11 @@ def _is_base64_image(data: Any) -> bool:
     """Check if a string is a base64 encoded image."""
     if not isinstance(data, str):
         return False
-
+
     # Check for common image data URI pattern
     if data.startswith(("data:image/", "data:application/octet-stream")):
         return True
-
+
     # Check if it's a base64 encoded string with image header
     try:
         # First, validate it's base64 decodable
@@ -95,9 +81,7 @@ def _is_base64_image(data: Any) -> bool:
         sample = base64.b64decode(data[:30])

         # Check for common image format signatures
-        return (
-            sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
-        )
+        return sample.startswith((b"\xff\xd8\xff", b"\x89PNG\r\n\x1a\n", b"GIF8", b"RIFF"))
     except Exception:
         return False

@@ -109,50 +93,46 @@ def judge(
     criteria: list[str] | list[dict] | None = None,
 ) -> EvaluationResult:
     """Judge a response against an answer using an LLM.
-
+
     Args:
         response: The response to evaluate
         answer: The reference answer to compare against
         llm: Optional langchain LLM to use for evaluation
         criteria: Evaluation criteria as strings or dictionaries
-
+
     Returns:
         EvaluationResult with evaluation results
     """
     # Process inputs
     processed_response = _process_input(response)
     processed_answer = _process_input(answer)
-
+
     # If LLM is provided, use it for evaluation
     if llm:
         return _evaluate_with_llm(processed_response, processed_answer, llm, criteria)
-
+
     # Otherwise, use the remote evaluation service
     mode = "LLM"
     if isinstance(answer, bytes) or _is_base64_image(answer):
         mode = "VLM"
-
+
     # Call the eval endpoint synchronously
-    result = asyncio.run(
-
-
-
-
-
-
+    result = asyncio.run(
+        _call_eval_endpoint(
+            response=processed_response, answer=processed_answer, criteria=criteria or [], mode=mode
+        )
+    )
+
     return EvaluationResult(
         score=result.get("score", -1.0),
         reason=result.get("reason", "Response evaluated"),
         mode=mode,
-        criteria_scores=result.get("criteria_scores", {})
+        criteria_scores=result.get("criteria_scores", {}),
     )


 def _evaluate_with_llm(
-    response: Any,
-    answer: Any,
-    llm: LLM,
-    criteria: list[str] | list[dict] | None = None
+    response: Any, answer: Any, llm: LLM, criteria: list[str] | list[dict] | None = None
 ) -> EvaluationResult:
     """Evaluate a response against an answer using a provided LLM."""
     criteria_text = ""
@@ -163,7 +143,7 @@ def _evaluate_with_llm(
             criteria_text += f"- {c['description']}\n"
         elif isinstance(c, str):
             criteria_text += f"- {c}\n"
-
+
     prompt = f"""Evaluate the quality of a response given a reference answer.

 REFERENCE ANSWER:
@@ -181,33 +161,29 @@ Format your answer as a JSON object with 'score' (float) and 'reason' (string) f
     try:
         # Run the evaluation asynchronously
         result_text = asyncio.run(llm.ainvoke(prompt))
-
+
         # Attempt to parse JSON response
         import json
         import re
-
+
         # Try to extract JSON if wrapped in other text
         json_match = re.search(r"\{.*?\}", result_text, re.DOTALL)
         if json_match:
             json_str = json_match.group(0)
             result = json.loads(json_str)
-
+
             return EvaluationResult(
                 score=float(result.get("score", 0.5)),
                 reason=result.get("reason", "Evaluated with custom LLM"),
-                mode="custom_llm"
+                mode="custom_llm",
             )
-
+
         # If can't parse as JSON, use default values
         return EvaluationResult(
             score=0.5,
             reason=f"Unable to parse LLM response as JSON. Raw response: {result_text[:100]}...",
-            mode="custom_llm"
+            mode="custom_llm",
         )
-
+
     except Exception as e:
-        return EvaluationResult(
-            score=0.0,
-            reason=f"LLM evaluation error: {e!s}",
-            mode="custom_llm"
-        )
+        return EvaluationResult(score=0.0, reason=f"LLM evaluation error: {e!s}", mode="custom_llm")