PyPI - hud-python - Versions diffs - 0.4.17__tar.gz → 0.4.18__tar.gz - Mend

hud-python 0.4.17__tar.gz → 0.4.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (183)

{hud_python-0.4.17 → hud_python-0.4.18}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.4.17
+Version: 0.4.18
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues

{hud_python-0.4.17 → hud_python-0.4.18}/hud/agents/misc/response_agent.py RENAMED Viewed

@@ -54,7 +54,7 @@ class ResponseAgent:
         """
         try:
             response = await self.client.chat.completions.create(
-                model="gpt-4o",
+                model="gpt-5-nano",
                 messages=[
                     {"role": "system", "content": self.system_prompt},
                     {

hud_python-0.4.18/hud/agents/openai_chat_generic.py ADDED Viewed

@@ -0,0 +1,288 @@
+"""Generic OpenAI chat-completions agent.
+This class provides the minimal glue required to connect any endpoint that
+implements the OpenAI compatible *chat.completions* API with MCP tool calling
+through the existing :class:`hud.agent.MCPAgent` scaffolding.
+Key points:
+- Stateless, no special server-side conversation state is assumed.
+- Accepts an :class:`openai.AsyncOpenAI` client, caller can supply their own
+  base_url / api_key (e.g. ART, llama.cpp, together.ai, …)
+- All HUD features (step_count, OTel spans, tool filtering, screenshots, …)
+  come from the ``MCPAgent`` base class, we only implement the three abstract
+  methods
+"""
+from __future__ import annotations
+import json
+import logging
+from typing import TYPE_CHECKING, Any, cast
+import mcp.types as types
+from hud import instrument
+from hud.types import AgentResponse, MCPToolCall, MCPToolResult
+from .base import MCPAgent
+if TYPE_CHECKING:
+    from openai import AsyncOpenAI
+    from openai.types.chat import ChatCompletionToolParam
+    from hud.clients import AgentMCPClient
+logger = logging.getLogger(__name__)
+class GenericOpenAIChatAgent(MCPAgent):
+    """MCP-enabled agent that speaks the OpenAI *chat.completions* protocol."""
+    def __init__(
+        self,
+        mcp_client: AgentMCPClient,
+        *,
+        openai_client: AsyncOpenAI,
+        model_name: str = "gpt-4o-mini",
+        parallel_tool_calls: bool = False,
+        logprobs: bool = False,
+        **agent_kwargs: Any,
+    ) -> None:
+        super().__init__(mcp_client=mcp_client, **agent_kwargs)
+        self.oai = openai_client
+        self.model_name = model_name
+        self.parallel_tool_calls = parallel_tool_calls
+        self.logprobs = logprobs
+        self.conversation_history = []
+    @staticmethod
+    def _oai_to_mcp(tool_call: Any) -> MCPToolCall:  # type: ignore[valid-type]
+        """Convert an OpenAI ``tool_call`` to :class:`MCPToolCall`."""
+        return MCPToolCall(
+            id=tool_call.id,
+            name=tool_call.function.name,
+            arguments=json.loads(tool_call.function.arguments or "{}"),
+        )
+    async def get_system_messages(self) -> list[Any]:
+        """Get system messages for OpenAI."""
+        return [{"role": "system", "content": self.system_prompt}]
+    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
+        """Format blocks for OpenAI."""
+        content = []
+        for block in blocks:
+            if isinstance(block, types.TextContent):
+                content.append({"type": "text", "text": block.text})
+            elif isinstance(block, types.ImageContent):
+                content.append(
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:{block.mimeType};base64,{block.data}"},
+                    }
+                )
+        return [{"role": "user", "content": content}]
+    def _sanitize_schema_for_openai(self, schema: dict) -> dict:
+        """Convert MCP JSON Schema to OpenAI-compatible format.
+        Handles unsupported features like anyOf and prefixItems.
+        """
+        if not isinstance(schema, dict):
+            return schema
+        sanitized = {}
+        for key, value in schema.items():
+            if key == "anyOf" and isinstance(value, list):
+                # Handle anyOf patterns (usually for nullable fields)
+                non_null_types = [
+                    v for v in value if not (isinstance(v, dict) and v.get("type") == "null")
+                ]
+                if non_null_types:
+                    # Use the first non-null type
+                    sanitized.update(self._sanitize_schema_for_openai(non_null_types[0]))
+                else:
+                    sanitized["type"] = "string"  # Fallback
+            elif key == "prefixItems":
+                # Convert prefixItems to simple items
+                sanitized["type"] = "array"
+                if isinstance(value, list) and value:
+                    # Use the type from the first item as the items schema
+                    first_item = value[0]
+                    if isinstance(first_item, dict):
+                        sanitized["items"] = {"type": first_item.get("type", "string")}
+                    else:
+                        sanitized["items"] = {"type": "string"}
+            elif key == "properties" and isinstance(value, dict):
+                # Recursively sanitize property schemas
+                sanitized[key] = {
+                    prop_name: self._sanitize_schema_for_openai(prop_schema)
+                    for prop_name, prop_schema in value.items()
+                }
+            elif key == "items" and isinstance(value, dict):
+                # Recursively sanitize items schema
+                sanitized[key] = self._sanitize_schema_for_openai(value)
+            elif key in (
+                "type",
+                "description",
+                "enum",
+                "required",
+                "default",
+                "minimum",
+                "maximum",
+                "minItems",
+                "maxItems",
+            ):
+                # These are supported by OpenAI
+                sanitized[key] = value
+        return sanitized or {"type": "object"}
+    def get_tool_schemas(self) -> list[dict]:
+        tool_schemas = super().get_tool_schemas()
+        openai_tools = []
+        for schema in tool_schemas:
+            parameters = schema.get("parameters", {})
+            if parameters:
+                sanitized_params = self._sanitize_schema_for_openai(parameters)
+            else:
+                sanitized_params = {"type": "object", "properties": {}}
+            openai_tool = {
+                "type": "function",
+                "function": {
+                    "name": schema["name"],
+                    "description": schema.get("description", ""),
+                    "parameters": sanitized_params,
+                },
+            }
+            openai_tools.append(openai_tool)
+        return openai_tools
+    @instrument(
+        span_type="agent",
+        record_args=False,
+        record_result=True,
+    )
+    async def get_response(self, messages: list[Any]) -> AgentResponse:
+        """Send chat request to OpenAI and convert the response."""
+        # Convert MCP tool schemas to OpenAI format
+        mcp_schemas = self.get_tool_schemas()
+        response = await self.oai.chat.completions.create(
+            model=self.model_name,
+            messages=messages,
+            tools=cast("list[ChatCompletionToolParam]", mcp_schemas),
+            parallel_tool_calls=self.parallel_tool_calls,
+            logprobs=self.logprobs,
+        )
+        choice = response.choices[0]
+        msg = choice.message
+        assistant_msg: dict[str, Any] = {"role": "assistant"}
+        if msg.content:
+            assistant_msg["content"] = msg.content
+        if msg.tool_calls:
+            assistant_msg["tool_calls"] = msg.tool_calls
+        messages.append(assistant_msg)
+        # Store the complete conversation history
+        self.conversation_history = messages.copy()
+        tool_calls = []
+        if msg.tool_calls:
+            for tc in msg.tool_calls:
+                if tc.function.name is not None:  # type: ignore
+                    tool_calls.append(self._oai_to_mcp(tc))
+                    if not self.parallel_tool_calls:
+                        break
+        return AgentResponse(
+            content=msg.content or "",
+            tool_calls=tool_calls,
+            done=choice.finish_reason in ("stop", "length"),
+            raw=response,  # Include raw response for access to Choice objects
+        )
+    async def format_tool_results(
+        self,
+        tool_calls: list[MCPToolCall],
+        tool_results: list[MCPToolResult],
+    ) -> list[Any]:
+        """Render MCP tool results as OpenAI messages.
+        Note: OpenAI tool messages only support string content.
+        When images are present, we return both a tool message and a user message.
+        """
+        rendered: list[dict[str, Any]] = []
+        for call, res in zip(tool_calls, tool_results, strict=False):
+            # Use structuredContent.result if available, otherwise use content
+            items = res.content
+            if res.structuredContent and isinstance(res.structuredContent, dict):
+                items = res.structuredContent.get("result", res.content)
+            # Separate text and image content
+            text_parts = []
+            image_parts = []
+            for item in items:
+                if isinstance(item, dict):
+                    if item.get("type") == "text":
+                        text_parts.append(item.get("text", ""))
+                    elif item.get("type") == "image":
+                        mime_type = item.get("mimeType", "image/png")
+                        data = item.get("data", "")
+                        image_parts.append(
+                            {
+                                "type": "image_url",
+                                "image_url": {
+                                    "url": f"data:{mime_type};base64,{data}"
+                                },
+                            }
+                        )
+                elif isinstance(item, types.TextContent):
+                    text_parts.append(item.text)
+                elif isinstance(item, types.ImageContent):
+                    image_parts.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": f"data:{item.mimeType};base64,{item.data}"},
+                        }
+                    )
+            text_content = "".join(text_parts) if text_parts else "Tool executed successfully"
+            rendered.append(
+                {
+                    "role": "tool",
+                    "tool_call_id": call.id,
+                    "content": text_content,
+                }
+            )
+            # If there are images, add them as a separate user message
+            if image_parts:
+                # Add a user message with the images
+                content_with_images = [
+                    {"type": "text", "text": "Tool returned the following:"},
+                    *image_parts
+                ]
+                rendered.append(
+                    {
+                        "role": "user",
+                        "content": content_with_images,
+                    }
+                )
+        return rendered

{hud_python-0.4.17 → hud_python-0.4.18}/hud/datasets/execution/parallel.py RENAMED Viewed

@@ -40,6 +40,7 @@ def _process_worker(
     2. Creates its own event loop
     3. Processes a batch of tasks asynchronously
     4. Returns results with their original indices
+    5. Handles interruption signals gracefully
     Args:
         task_batch: List of (index, task_dict) tuples
@@ -58,6 +59,7 @@ def _process_worker(
         List of (index, result) tuples
     """
     # Import inside worker to avoid pickling issues
+    import signal
     import sys
     import hud
@@ -72,6 +74,14 @@ def _process_worker(
     except AttributeError:
         pass
+    # Set up signal handler for clean interruption
+    def signal_handler(signum: int, frame: Any) -> None:
+        logger.warning("Worker %s: Received interrupt signal", worker_id)
+        # Raise KeyboardInterrupt to actually interrupt the worker
+        raise KeyboardInterrupt(f"Worker {worker_id} interrupted by user")
+    signal.signal(signal.SIGINT, signal_handler)
     # Reinitialize telemetry in this process
     configure_telemetry()
@@ -157,8 +167,25 @@ def _process_worker(
         # Process all tasks in parallel within this process
         tasks = [process_single_task(idx, task_dict) for idx, task_dict in task_batch]
-        results = await asyncio.gather(*tasks, return_exceptions=False)
-        return results
+        try:
+            results = await asyncio.gather(*tasks, return_exceptions=False)
+            return results
+        except asyncio.CancelledError:
+            logger.info("Worker %s: Tasks cancelled due to interruption", worker_id)
+            # Return error results for all tasks
+            return [
+                (
+                    idx,
+                    {
+                        "error": "Task cancelled (Ctrl+C)",
+                        "isError": True,
+                        "reward": 0.0,
+                        "done": False,
+                        "content": "Task cancelled",
+                    },
+                )
+                for idx, _ in task_batch
+            ]
     try:
         # Run the async batch processing
@@ -180,6 +207,24 @@ def _process_worker(
                 logger.warning("Worker %s: Telemetry flush timed out", worker_id)
         return results
+    except KeyboardInterrupt:
+        logger.info("Worker %s: Interrupted by user, stopping gracefully", worker_id)
+        # Return partial results for tasks that completed
+        partial_results = []
+        for idx, _ in task_batch:
+            partial_results.append(
+                (
+                    idx,
+                    {
+                        "error": "Worker interrupted by user (Ctrl+C)",
+                        "isError": True,
+                        "reward": 0.0,
+                        "done": False,
+                        "content": "Task interrupted",
+                    },
+                )
+            )
+        return partial_results
     except Exception as e:
         logger.error("[Worker %s] Batch processing failed: %s", worker_id, e)
         logger.error("Worker %s batch processing failed: %s", worker_id, e)
@@ -365,7 +410,8 @@ async def run_dataset_parallel_manual(
         )
         # Process batches in parallel using ProcessPoolExecutor
-        with ProcessPoolExecutor(max_workers=max_workers) as executor:
+        executor = ProcessPoolExecutor(max_workers=max_workers)
+        try:
             # Submit all batches to workers
             future_to_batch = {
                 executor.submit(worker_func, batch, worker_id=i): batch
@@ -377,48 +423,78 @@ async def run_dataset_parallel_manual(
             total = len(task_dicts)
             # Process results as they complete
-            for future in as_completed(future_to_batch):
-                batch = future_to_batch[future]
-                try:
-                    # Get results from this worker
-                    batch_results = future.result()
-                    # Place results in correct positions
-                    for index, result in batch_results:
-                        results[index] = result
-                        completed += 1
-                    # Calculate success rate so far
-                    successful_so_far = sum(
-                        1
-                        for r in results[:completed]
-                        if r is not None and getattr(r, "reward", 0) > 0
-                    )
+            try:
+                for future in as_completed(future_to_batch):
+                    batch = future_to_batch[future]
+                    try:
+                        # Get results from this worker
+                        batch_results = future.result()
+                        # Place results in correct positions
+                        for index, result in batch_results:
+                            results[index] = result
+                            completed += 1
+                        # Calculate success rate so far
+                        successful_so_far = sum(
+                            1
+                            for r in results[:completed]
+                            if r is not None and getattr(r, "reward", 0) > 0
+                        )
-                    progress_msg = (
-                        f"Progress: {completed}/{total} tasks completed "
-                        f"({100 * completed / total:.1f}%) | "
-                        f"Success rate: {successful_so_far}/{completed} "
-                        f"({100 * successful_so_far / completed:.1f}%)"
-                    )
+                        progress_msg = (
+                            f"Progress: {completed}/{total} tasks completed "
+                            f"({100 * completed / total:.1f}%) | "
+                            f"Success rate: {successful_so_far}/{completed} "
+                            f"({100 * successful_so_far / completed:.1f}%)"
+                        )
-                    logger.info(progress_msg)
+                        logger.info(progress_msg)
-                except Exception as e:
-                    # Handle worker failure
-                    logger.error("Worker failed with exception: %s\n%s", e, traceback.format_exc())
+                    except Exception as e:
+                        # Handle worker failure
+                        logger.error(
+                            "Worker failed with exception: %s\n%s", e, traceback.format_exc()
+                        )
-                    # Mark all tasks in this batch as failed
-                    for index, _ in batch:
-                        results[index] = {
-                            "error": f"Worker process failed: {e}",
+                        # Mark all tasks in this batch as failed
+                        for index, _ in batch:
+                            results[index] = {
+                                "error": f"Worker process failed: {e}",
+                                "isError": True,
+                                "reward": 0.0,
+                                "done": False,
+                                "content": f"Worker process failed: {e}",
+                            }
+                            completed += 1
+            except KeyboardInterrupt:
+                logger.warning("\n⚠️  Parallel evaluation interrupted by user (Ctrl+C)")
+                logger.info("Cancelling pending tasks...")
+                # Cancel all pending futures
+                for future in future_to_batch:
+                    if not future.done():
+                        future.cancel()
+                # Mark uncompleted tasks as interrupted
+                for i, r in enumerate(results):
+                    if r is None:
+                        results[i] = {
+                            "error": "Evaluation interrupted by user",
                             "isError": True,
                             "reward": 0.0,
                             "done": False,
-                            "content": f"Worker process failed: {e}",
+                            "content": "Task interrupted (Ctrl+C)",
                         }
-                        completed += 1
+                logger.info("Interrupted after %s/%s tasks", completed, total)
+                raise  # Re-raise to propagate the interrupt
+        finally:
+            # Always shutdown the executor properly
+            executor.shutdown(wait=False, cancel_futures=True)
         # Verify all results are populated
         missing = [i for i, r in enumerate(results) if r is None]

{hud_python-0.4.17 → hud_python-0.4.18}/hud/otel/exporters.py RENAMED Viewed

@@ -14,6 +14,7 @@ from __future__ import annotations
 import contextlib
 import json
 import logging
+import time
 from collections import defaultdict
 from datetime import UTC, datetime
 from typing import TYPE_CHECKING, Any
@@ -362,5 +363,7 @@ class HudSpanExporter(SpanExporter):
         pass
     def force_flush(self, timeout_millis: int | None = None) -> bool:  # type: ignore[override]
+        if timeout_millis:
+            time.sleep(timeout_millis / 1000)
         # Synchronous export, nothing buffered here
         return True

{hud_python-0.4.17 → hud_python-0.4.18}/hud/otel/processors.py RENAMED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
 import logging
+import time
 from typing import Any
 from opentelemetry import baggage
@@ -115,4 +116,6 @@ class HudEnrichmentProcessor(SpanProcessor):
         pass
     def force_flush(self, timeout_millis: int | None = None) -> bool:  # type: ignore[override]
+        if timeout_millis:
+            time.sleep(timeout_millis / 1000)
         return True

{hud_python-0.4.17 → hud_python-0.4.18}/hud/utils/tests/test_version.py RENAMED Viewed

@@ -5,4 +5,4 @@ def test_import():
     """Test that the package can be imported."""
     import hud
-    assert hud.__version__ == "0.4.17"
+    assert hud.__version__ == "0.4.18"

{hud_python-0.4.17 → hud_python-0.4.18}/hud/version.py RENAMED Viewed

@@ -4,4 +4,4 @@ Version information for the HUD SDK.
 from __future__ import annotations
-__version__ = "0.4.17"
+__version__ = "0.4.18"

{hud_python-0.4.17 → hud_python-0.4.18}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "hud-python"
-version = "0.4.17"
+version = "0.4.18"
 description = "SDK for the HUD platform."
 readme = "README.md"
 requires-python = ">=3.11, <3.14"

hud-python 0.4.17__tar.gz → 0.4.18__tar.gz

Potentially problematic release.

hud-python 0.4.17__tar.gz → 0.4.18__tar.gz