PyPI - hud-python - Versions diffs - 0.4.23__tar.gz → 0.4.24__tar.gz - Mend

hud-python 0.4.23.tar.gz → 0.4.24.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (201)

{hud_python-0.4.23 → hud_python-0.4.24}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.4.23
+Version: 0.4.24
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues

{hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/base.py RENAMED Viewed

@@ -207,6 +207,7 @@ class MCPAgent(ABC):
             else:
                 raise TypeError(f"prompt_or_task must be str or Task, got {type(prompt_or_task)}")
         except Exception as e:
+            # Always return a Trace object for any exception
             if self._is_connection_error(e):
                 # Return error trace for connection failures
                 return Trace(
@@ -215,7 +216,15 @@ class MCPAgent(ABC):
                     content=self._get_connection_error_message(e),
                     isError=True,
                 )
-            raise
+            else:
+                # Return error trace for any other exception
+                return Trace(
+                    reward=0.0,
+                    done=True,
+                    content=f"Task failed with error: {e}",
+                    isError=True,
+                    info={"error": str(e)},
+                )
         finally:
             # Cleanup auto-created resources
             await self._cleanup()
@@ -262,34 +271,53 @@ class MCPAgent(ABC):
             prompt_result = Trace(reward=0.0, done=True, content=str(e), isError=True)
             prompt_result.populate_from_context()
-        # Always evaluate if we have a prompt result and evaluate tool
-        if prompt_result is not None and task.evaluate_tool is not None:
+        # Always evaluate if we have evaluate tool, regardless of errors
+        if task.evaluate_tool is not None:
             try:
                 self.console.progress_log(f"Evaluating tool phase: {task.evaluate_tool}")
                 results = await self.call_tools(task.evaluate_tool)
                 if any(result.isError for result in results):
-                    raise RuntimeError(f"{results}")
-                # Extract reward and content from evaluation
-                if results:
-                    reward = find_reward(results[0])
-                    eval_content = find_content(results[0])
-                    # Update the prompt result with evaluation reward
-                    prompt_result.reward = reward
-                    # Update the prompt result with evaluation content (if available)
-                    if eval_content:
-                        # Prompt result may already have final response content, so we append to it
-                        if prompt_result.content:
-                            prompt_result.content += "\n\n" + eval_content
+                    self.console.warning_log(f"Evaluate tool returned error: {results}")
+                    # Still extract what we can from the error response
+                    if prompt_result is None:
+                        prompt_result = Trace(
+                            reward=0.0,
+                            done=True,
+                            content="Task failed before evaluation",
+                            isError=True,
+                        )
+                    prompt_result.reward = 0.0  # Default to 0 on error
+                else:
+                    # Extract reward and content from evaluation
+                    if results:
+                        reward = find_reward(results[0])
+                        eval_content = find_content(results[0])
+                        # Update the prompt result with evaluation reward
+                        if prompt_result is None:
+                            prompt_result = Trace(
+                                reward=reward, done=True, content=eval_content or "", isError=False
+                            )
                         else:
-                            prompt_result.content = eval_content
+                            prompt_result.reward = reward
+                            # Update the prompt result with evaluation content (if available)
+                            if eval_content:
+                                # Prompt result may already have final response content,
+                                # so we append to it
+                                if prompt_result.content:
+                                    prompt_result.content += "\n\n" + eval_content
+                                else:
+                                    prompt_result.content = eval_content
             except Exception as e:
                 self.console.error_log(f"Evaluation phase failed: {e}")
-                # Continue with the prompt result even if evaluation failed
+                # Ensure we have a result even if evaluation failed
+                if prompt_result is None:
+                    prompt_result = Trace(
+                        reward=0.0, done=True, content=f"Evaluation failed: {e}", isError=True
+                    )
         return (
             prompt_result

{hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/claude.py RENAMED Viewed

@@ -196,7 +196,11 @@ class ClaudeAgent(MCPAgent):
                 response = await self.anthropic_client.beta.messages.create(**create_kwargs)
                 break
             except BadRequestError as e:
-                if e.message.startswith("prompt is too long"):
+                if (
+                    "prompt is too long" in str(e)
+                    or "request_too_large" in str(e)
+                    or e.status_code == 413
+                ):
                     logger.warning("Prompt too long, truncating message history")
                     # Keep first message and last 20 messages
                     if len(current_messages) > 21:

{hud_python-0.4.23 → hud_python-0.4.24}/hud/clients/mcp_use.py RENAMED Viewed

@@ -15,6 +15,7 @@ from hud.types import MCPToolCall, MCPToolResult
 from hud.version import __version__ as hud_version
 from .base import BaseHUDClient
+from .utils.retry import retry_with_backoff
 logger = logging.getLogger(__name__)
@@ -127,8 +128,11 @@ class MCPUseHUDClient(BaseHUDClient):
                     logger.warning("Client session not initialized for %s", server_name)
                     continue
-                # List tools
-                tools_result = await session.connector.client_session.list_tools()
+                # List tools with retry logic for HTTP errors
+                tools_result = await retry_with_backoff(
+                    session.connector.client_session.list_tools,
+                    operation_name=f"list_tools_{server_name}",
+                )
                 logger.info(
                     "Discovered %d tools from '%s': %s",
@@ -202,9 +206,12 @@ class MCPUseHUDClient(BaseHUDClient):
         if session.connector.client_session is None:
             raise ValueError(f"Client session not initialized for {server_name}")
-        result = await session.connector.client_session.call_tool(
+        # Call tool with retry logic for HTTP errors (502, 503, 504)
+        result = await retry_with_backoff(
+            session.connector.client_session.call_tool,
             name=original_tool.name,  # Use original tool name, not prefixed
             arguments=tool_call.arguments or {},
+            operation_name=f"call_tool_{original_tool.name}",
         )
         if self.verbose:
@@ -232,7 +239,10 @@ class MCPUseHUDClient(BaseHUDClient):
                     continue
                 # Prefer standard method name if available
                 if hasattr(session.connector.client_session, "list_resources"):
-                    resources = await session.connector.client_session.list_resources()
+                    resources = await retry_with_backoff(
+                        session.connector.client_session.list_resources,
+                        operation_name=f"list_resources_{server_name}",
+                    )
                 else:
                     # If the client doesn't support resource listing, skip
                     continue
@@ -262,7 +272,11 @@ class MCPUseHUDClient(BaseHUDClient):
                 resource_uri = AnyUrl(uri) if isinstance(uri, str) else uri
                 # Prefer read_resource; fall back to list_resources if needed
                 if hasattr(session.connector.client_session, "read_resource"):
-                    result = await session.connector.client_session.read_resource(resource_uri)
+                    result = await retry_with_backoff(
+                        session.connector.client_session.read_resource,
+                        resource_uri,
+                        operation_name=f"read_resource_{server_name}",
+                    )
                 else:
                     # Fallback path for older clients: not supported in strict typing
                     raise AttributeError("read_resource not available")

hud_python-0.4.24/hud/clients/utils/__init__.py ADDED Viewed

@@ -0,0 +1,26 @@
+"""HUD MCP client utilities."""
+from __future__ import annotations
+from .retry import (
+    DEFAULT_BACKOFF_FACTOR,
+    DEFAULT_MAX_RETRIES,
+    DEFAULT_RETRY_DELAY,
+    DEFAULT_RETRY_STATUS_CODES,
+    is_retryable_error,
+    retry_with_backoff,
+    with_retry,
+)
+from .retry_transport import RetryTransport, create_retry_httpx_client
+__all__ = [
+    "DEFAULT_BACKOFF_FACTOR",
+    "DEFAULT_MAX_RETRIES",
+    "DEFAULT_RETRY_DELAY",
+    "DEFAULT_RETRY_STATUS_CODES",
+    "RetryTransport",
+    "create_retry_httpx_client",
+    "is_retryable_error",
+    "retry_with_backoff",
+    "with_retry",
+]

hud_python-0.4.24/hud/clients/utils/retry.py ADDED Viewed

@@ -0,0 +1,186 @@
+"""Shared retry utilities for MCP client operations."""
+from __future__ import annotations
+import asyncio
+import logging
+from functools import wraps
+from typing import TYPE_CHECKING, Any, TypeVar
+if TYPE_CHECKING:
+    from collections.abc import Callable
+from httpx import HTTPStatusError
+from mcp.shared.exceptions import McpError
+logger = logging.getLogger(__name__)
+T = TypeVar("T")
+# Default retry configuration matching requests.py
+DEFAULT_MAX_RETRIES = 4
+DEFAULT_RETRY_DELAY = 2.0
+DEFAULT_RETRY_STATUS_CODES = {502, 503, 504}
+DEFAULT_BACKOFF_FACTOR = 2.0
+def is_retryable_error(error: Exception, retry_status_codes: set[int]) -> bool:
+    """
+    Check if an error is retryable based on status codes.
+    Args:
+        error: The exception to check
+        retry_status_codes: Set of HTTP status codes to retry on
+    Returns:
+        True if the error is retryable, False otherwise
+    """
+    # Check for HTTP status errors with retryable status codes
+    if isinstance(error, HTTPStatusError):
+        return error.response.status_code in retry_status_codes
+    # Check for MCP errors that might wrap HTTP errors
+    if isinstance(error, McpError):
+        error_msg = str(error).lower()
+        # Check for common gateway error patterns in the message
+        for code in retry_status_codes:
+            if str(code) in error_msg:
+                return True
+        # Check for gateway error keywords
+        if any(
+            keyword in error_msg
+            for keyword in ["bad gateway", "service unavailable", "gateway timeout"]
+        ):
+            return True
+    # Check for generic errors with status codes in the message
+    error_msg = str(error)
+    for code in retry_status_codes:
+        if f"{code}" in error_msg or f"status {code}" in error_msg.lower():
+            return True
+    return False
+async def retry_with_backoff(
+    func: Callable[..., Any],
+    *args: Any,
+    max_retries: int = DEFAULT_MAX_RETRIES,
+    retry_delay: float = DEFAULT_RETRY_DELAY,
+    retry_status_codes: set[int] | None = None,
+    backoff_factor: float = DEFAULT_BACKOFF_FACTOR,
+    operation_name: str | None = None,
+    **kwargs: Any,
+) -> Any:
+    """
+    Execute an async function with retry logic and exponential backoff.
+    This matches the retry behavior in requests.py but can be applied
+    to any async function, particularly MCP client operations.
+    Args:
+        func: The async function to retry
+        *args: Positional arguments for the function
+        max_retries: Maximum number of retry attempts
+        retry_delay: Initial delay between retries in seconds
+        retry_status_codes: HTTP status codes to retry on
+        backoff_factor: Multiplier for exponential backoff
+        operation_name: Name of the operation for logging
+        **kwargs: Keyword arguments for the function
+    Returns:
+        The result of the function call
+    Raises:
+        The last exception if all retries are exhausted
+    """
+    if retry_status_codes is None:
+        retry_status_codes = DEFAULT_RETRY_STATUS_CODES
+    operation = operation_name or func.__name__
+    last_error = None
+    for attempt in range(max_retries + 1):
+        try:
+            result = await func(*args, **kwargs)
+            return result
+        except Exception as e:
+            last_error = e
+            # Check if this is a retryable error
+            if not is_retryable_error(e, retry_status_codes):
+                # Not retryable, raise immediately
+                raise
+            # Don't retry if we've exhausted attempts
+            if attempt >= max_retries:
+                logger.debug(
+                    "Operation '%s' failed after %d retries: %s",
+                    operation,
+                    max_retries,
+                    e,
+                )
+                raise
+            # Calculate backoff delay (exponential backoff)
+            delay = retry_delay * (backoff_factor**attempt)
+            logger.warning(
+                "Operation '%s' failed with retryable error, "
+                "retrying in %.2f seconds (attempt %d/%d): %s",
+                operation,
+                delay,
+                attempt + 1,
+                max_retries,
+                e,
+            )
+            await asyncio.sleep(delay)
+    # This should never be reached, but just in case
+    if last_error:
+        raise last_error
+    raise RuntimeError(f"Unexpected retry loop exit for operation '{operation}'")
+def with_retry(
+    max_retries: int = DEFAULT_MAX_RETRIES,
+    retry_delay: float = DEFAULT_RETRY_DELAY,
+    retry_status_codes: set[int] | None = None,
+    backoff_factor: float = DEFAULT_BACKOFF_FACTOR,
+) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
+    """
+    Decorator to add retry logic to async methods.
+    Usage:
+        @with_retry(max_retries=3)
+        async def my_method(self, ...):
+            ...
+    Args:
+        max_retries: Maximum number of retry attempts
+        retry_delay: Initial delay between retries
+        retry_status_codes: HTTP status codes to retry on
+        backoff_factor: Multiplier for exponential backoff
+    Returns:
+        Decorated function with retry logic
+    """
+    def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
+        @wraps(func)
+        async def wrapper(*args: Any, **kwargs: Any) -> Any:
+            return await retry_with_backoff(
+                func,
+                *args,
+                max_retries=max_retries,
+                retry_delay=retry_delay,
+                retry_status_codes=retry_status_codes,
+                backoff_factor=backoff_factor,
+                operation_name=func.__name__,
+                **kwargs,
+            )
+        return wrapper
+    return decorator

{hud_python-0.4.23 → hud_python-0.4.24}/hud/datasets/execution/parallel.py RENAMED Viewed

@@ -114,36 +114,58 @@ def _process_worker(
                     task_name = task_dict.get("prompt") or f"Task {index}"
                     # Use the job_id to group all tasks under the same job
-                    with hud.trace(task_name, job_id=job_id, task_id=task_dict.get("id")):
-                        # Convert dict to Task
-                        task = Task(**task_dict)
-                        # Create agent instance
-                        agent = agent_class(**(agent_config or {}))
-                        if auto_respond:
-                            agent.response_agent = ResponseAgent()
-                        # Run the task
-                        result = await agent.run(task, max_steps=max_steps)
-                        # Extract and print evaluation score for visibility
-                        reward = getattr(result, "reward", "N/A")
-                        logger.info(
-                            "[Worker %s] Task %s: ✓ Completed (reward: %s)",
-                            worker_id,
-                            index,
-                            reward,
-                        )
-                        logger.info(
-                            "[Worker %s] Completed task %s (reward: %s)",
-                            worker_id,
-                            index,
-                            reward,
-                        )
-                        return (index, result)
+                    with hud.trace(
+                        task_name, job_id=job_id, task_id=task_dict.get("id")
+                    ):
+                        try:
+                            # Convert dict to Task
+                            task = Task(**task_dict)
+                            # Create agent instance
+                            agent = agent_class(**(agent_config or {}))
+                            if auto_respond:
+                                agent.response_agent = ResponseAgent()
+                            # Run the task - this should ALWAYS return a result, even on error
+                            result = await agent.run(task, max_steps=max_steps)
+                            # Extract and print evaluation score for visibility
+                            reward = getattr(result, "reward", "N/A")
+                            logger.info(
+                                "[Worker %s] Task %s: ✓ Completed (reward: %s)",
+                                worker_id,
+                                index,
+                                reward,
+                            )
+                            logger.info(
+                                "[Worker %s] Completed task %s (reward: %s)",
+                                worker_id,
+                                index,
+                                reward,
+                            )
+                            return (index, result)
+                        except Exception as e:
+                            # Even if there's an exception, ensure we have a proper result
+                            logger.error(
+                                "[Worker %s] Task %s failed during execution: %s",
+                                worker_id,
+                                index,
+                                str(e)[:200],
+                            )
+                            # Create a proper Trace result for errors
+                            from hud.types import Trace
+                            error_result = Trace(
+                                reward=0.0,
+                                done=True,
+                                content=f"Task execution failed: {e}",
+                                isError=True,
+                                info={"error": str(e), "traceback": traceback.format_exc()},
+                            )
+                            return (index, error_result)
                 except Exception as e:
                     error_msg = f"Worker {worker_id}: Task {index} failed: {e}"
@@ -190,22 +212,6 @@ def _process_worker(
     try:
         # Run the async batch processing
         results = loop.run_until_complete(process_batch())
-        # CRITICAL: Ensure telemetry is fully sent before process exits
-        # Two things need to complete:
-        # 1. The trace context's __exit__ already called _update_task_status_sync (blocking)
-        # 2. But spans are buffered in BatchSpanProcessor and need explicit flush
-        from opentelemetry import trace as otel_trace
-        provider = otel_trace.get_tracer_provider()
-        if provider and hasattr(provider, "force_flush"):
-            # This forces BatchSpanProcessor to export all buffered spans NOW
-            # The method returns True if successful, False if timeout
-            success = provider.force_flush(timeout_millis=5000)  # 5 second timeout # type: ignore
-            if not success:
-                logger.warning("Worker %s: Telemetry flush timed out", worker_id)
         return results
     except KeyboardInterrupt:
         logger.info("Worker %s: Interrupted by user, stopping gracefully", worker_id)
@@ -230,6 +236,25 @@ def _process_worker(
         logger.error("Worker %s batch processing failed: %s", worker_id, e)
         return [(idx, {"error": str(e), "isError": True}) for idx, _ in task_batch]
     finally:
+        # CRITICAL: Always ensure telemetry is fully sent before process exits
+        # This must happen in finally block to ensure it runs even on errors
+        try:
+            from opentelemetry import trace as otel_trace
+            provider = otel_trace.get_tracer_provider()
+            if provider and hasattr(provider, "force_flush"):
+                # This forces BatchSpanProcessor to export all buffered spans NOW
+                # The method returns True if successful, False if timeout
+                success = provider.force_flush(
+                    timeout_millis=10000
+                )  # 10 second timeout # type: ignore
+                if not success:
+                    logger.warning("Worker %s: Telemetry flush timed out", worker_id)
+                else:
+                    logger.debug("Worker %s: Telemetry flushed successfully", worker_id)
+        except Exception as flush_error:
+            logger.error("Worker %s: Failed to flush telemetry: %s", worker_id, flush_error)
         # Clean up the event loop
         try:
             loop.close()

{hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/tests/test_version.py RENAMED Viewed

@@ -5,4 +5,4 @@ def test_import():
     """Test that the package can be imported."""
     import hud
-    assert hud.__version__ == "0.4.23"
+    assert hud.__version__ == "0.4.24"

{hud_python-0.4.23 → hud_python-0.4.24}/hud/version.py RENAMED Viewed

@@ -4,4 +4,4 @@ Version information for the HUD SDK.
 from __future__ import annotations
-__version__ = "0.4.23"
+__version__ = "0.4.24"

{hud_python-0.4.23 → hud_python-0.4.24}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "hud-python"
-version = "0.4.23"
+version = "0.4.24"
 description = "SDK for the HUD platform."
 readme = "README.md"
 requires-python = ">=3.11, <3.14"