hud-python 0.4.50__py3-none-any.whl → 0.4.52__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of hud-python has been flagged as possibly problematic.

hud/otel/exporters.py CHANGED
@@ -1,21 +1,27 @@
-"""Custom OpenTelemetry exporter that sends spans to the existing HUD telemetry
-HTTP endpoint (/trace/<id>/telemetry-upload).
+"""Custom OpenTelemetry exporter for HUD telemetry backend.
 
-The exporter groups spans by ``hud.task_run_id`` baggage / attribute so we keep
-exactly the same semantics the old async worker in ``hud.telemetry.exporter``
-implemented.
+This exporter sends spans to the HUD telemetry HTTP endpoint, grouping them
+by task_run_id for efficient batch uploads.
 
-This exporter is *synchronous* (derives from :class:`SpanExporter`). We rely on
-``hud.shared.make_request_sync`` which already contains retry & auth logic.
+Performance optimizations:
+- Detects async contexts and runs exports in a thread pool to avoid blocking
+- Uses persistent HTTP client with connection pooling for reduced overhead
+- Tracks pending export futures to ensure completion during shutdown
+
+The exporter derives from SpanExporter (synchronous interface) but handles
+async contexts intelligently to prevent event loop blocking during high-concurrency
+workloads.
 """
 
 from __future__ import annotations
 
+import atexit
+import concurrent.futures as cf
 import contextlib
 import json
 import logging
-import time
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
 from datetime import UTC, datetime
 from typing import TYPE_CHECKING, Any
 
@@ -31,6 +37,34 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
+# Global singleton thread pool for span exports
+_export_executor: ThreadPoolExecutor | None = None
+
+
+def get_export_executor() -> ThreadPoolExecutor:
+    """Get or create the global thread pool for span exports.
+
+    Returns a singleton ThreadPoolExecutor used for running span exports
+    in a thread pool when called from async contexts, preventing event
+    loop blocking during high-concurrency workloads.
+
+    The executor is automatically cleaned up on process exit via atexit.
+
+    Returns:
+        ThreadPoolExecutor with 8 workers for high-throughput parallel uploads
+    """
+    global _export_executor
+    if _export_executor is None:
+        # Use 8 workers to handle high-volume parallel uploads efficiently
+        _export_executor = ThreadPoolExecutor(max_workers=8, thread_name_prefix="span-export")
+
+        def cleanup() -> None:
+            if _export_executor is not None:
+                _export_executor.shutdown(wait=True)
+
+        atexit.register(cleanup)
+    return _export_executor
+
 
 # ---------------------------------------------------------------------------
 # Models
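
As an aside, the detect-and-offload pattern that get_export_executor supports (and that the export hunk below implements in full) boils down to the following minimal sketch. It is illustrative only, not package code; blocking_upload is a hypothetical stand-in for the per-batch HTTP upload.

import asyncio


def export_without_blocking(blocking_upload):
    """Run blocking_upload off the event loop if one is running (illustrative only)."""
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No running event loop: it is safe to block the calling thread.
        return blocking_upload()
    # An event loop is running: hand the blocking work to the shared executor
    # and return immediately; the caller keeps the future if it needs the result.
    future = loop.run_in_executor(get_export_executor(), blocking_upload)
    future.add_done_callback(lambda f: f.exception())  # consume errors quietly
    return future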
@@ -297,73 +331,213 @@ def _span_to_dict(span: ReadableSpan) -> dict[str, Any]:
 
 
 class HudSpanExporter(SpanExporter):
-    """Exporter that forwards spans to HUD backend using existing endpoint."""
+    """OpenTelemetry span exporter for the HUD backend.
+
+    This exporter groups spans by task_run_id and sends them to the HUD
+    telemetry endpoint. Performance optimizations include:
+
+    - Auto-detects async contexts and runs exports in thread pool (non-blocking)
+    - Tracks pending export futures for proper shutdown coordination
+
+    Handles high-concurrency scenarios (200+ parallel tasks) by offloading
+    synchronous HTTP operations to a thread pool when called from async
+    contexts, preventing event loop blocking.
+    """
 
     def __init__(self, *, telemetry_url: str, api_key: str) -> None:
+        """Initialize the HUD span exporter.
+
+        Args:
+            telemetry_url: Base URL for the HUD telemetry backend
+            api_key: API key for authentication
+        """
         super().__init__()
         self._telemetry_url = telemetry_url.rstrip("/")
         self._api_key = api_key
 
-    # ------------------------------------------------------------------
-    # Core API
-    # ------------------------------------------------------------------
+        # Track pending export futures for shutdown coordination
+        self._pending_futures: list[cf.Future[SpanExportResult]] = []
+
     def export(self, spans: list[ReadableSpan]) -> SpanExportResult:  # type: ignore[override]
+        """Export spans to HUD backend.
+
+        Auto-detects async contexts: if called from an async event loop, runs
+        the export in a thread pool to avoid blocking. Otherwise runs synchronously.
+
+        Args:
+            spans: List of ReadableSpan objects to export
+
+        Returns:
+            SpanExportResult.SUCCESS (returns immediately in async contexts)
+        """
         if not spans:
             return SpanExportResult.SUCCESS
 
-        # Group spans by hud.task_run_id attribute
+        # Group spans by task_run_id for batched uploads
         grouped: dict[str, list[ReadableSpan]] = defaultdict(list)
         for span in spans:
             run_id = span.attributes.get("hud.task_run_id") if span.attributes else None
             if not run_id:
-                # Skip spans that are outside HUD traces
+                # Skip spans outside HUD traces
                 continue
             grouped[str(run_id)].append(span)
 
-        # Send each group synchronously (retry inside make_request_sync)
-        for run_id, span_batch in grouped.items():
-            try:
-                url = f"{self._telemetry_url}/trace/{run_id}/telemetry-upload"
-                telemetry_spans = [_span_to_dict(s) for s in span_batch]
-                # Include current step count in metadata
-                metadata = {}
-                # Get the HIGHEST step count from the batch (most recent)
-                step_count = 0
-                for span in span_batch:
-                    if span.attributes and "hud.step_count" in span.attributes:
-                        current_step = span.attributes["hud.step_count"]
-                        if isinstance(current_step, int) and current_step > step_count:
-                            step_count = current_step
-
-                payload = {
-                    "metadata": metadata,
-                    "telemetry": telemetry_spans,
-                }
-
-                # Only include step_count if we found any steps
-                if step_count > 0:
-                    payload["step_count"] = step_count
-
-                logger.debug("HUD exporter sending %d spans to %s", len(span_batch), url)
-                make_request_sync(
-                    method="POST",
-                    url=url,
-                    json=payload,
-                    api_key=self._api_key,
-                )
-            except Exception as exc:
-                logger.exception("HUD exporter failed to send spans for task %s: %s", run_id, exc)
-                # If *any* group fails we return FAILURE so the OTEL SDK can retry
-                return SpanExportResult.FAILURE
-
-        return SpanExportResult.SUCCESS
+        # Detect async context to avoid event loop blocking
+        import asyncio
+
+        try:
+            loop = asyncio.get_running_loop()
+            # In async context - offload to thread pool
+            executor = get_export_executor()
+
+            def _sync_export() -> SpanExportResult:
+                # Send each group synchronously (retry inside make_request_sync)
+                for run_id, span_batch in grouped.items():
+                    try:
+                        url = f"{self._telemetry_url}/trace/{run_id}/telemetry-upload"
+                        telemetry_spans = [_span_to_dict(s) for s in span_batch]
+                        # Include current step count in metadata
+                        metadata = {}
+                        # Get the HIGHEST step count from the batch (most recent)
+                        step_count = 0
+                        for span in span_batch:
+                            if span.attributes and "hud.step_count" in span.attributes:
+                                current_step = span.attributes["hud.step_count"]
+                                if isinstance(current_step, int) and current_step > step_count:
+                                    step_count = current_step
+
+                        payload = {
+                            "metadata": metadata,
+                            "telemetry": telemetry_spans,
+                        }
+
+                        # Only include step_count if we found any steps
+                        if step_count > 0:
+                            payload["step_count"] = step_count
+
+                        logger.debug("HUD exporter sending %d spans to %s", len(span_batch), url)
+                        make_request_sync(
+                            method="POST",
+                            url=url,
+                            json=payload,
+                            api_key=self._api_key,
+                        )
+                    except Exception as exc:
+                        logger.exception(
+                            "HUD exporter failed to send spans for task %s: %s", run_id, exc
+                        )
+                        return SpanExportResult.FAILURE
+                return SpanExportResult.SUCCESS
+
+            # Run in thread to avoid blocking event loop
+            future = loop.run_in_executor(executor, _sync_export)
+            # Track and cleanup when done
+            self._pending_futures.append(future)  # type: ignore[list-item]
+
+            def _cleanup_done(f: cf.Future[SpanExportResult]) -> None:
+                with contextlib.suppress(Exception):
+                    # Consume exception to avoid "exception was never retrieved"
+                    _ = f.exception()
+                # Remove from pending list
+                with contextlib.suppress(ValueError):
+                    self._pending_futures.remove(f)
+
+            future.add_done_callback(_cleanup_done)  # type: ignore[arg-type]
+            # Don't wait for it - return immediately
+            return SpanExportResult.SUCCESS
+
+        except RuntimeError:
+            # No event loop - run synchronously
+            # Send each group synchronously (retry inside make_request_sync)
+            for run_id, span_batch in grouped.items():
+                try:
+                    url = f"{self._telemetry_url}/trace/{run_id}/telemetry-upload"
+                    telemetry_spans = [_span_to_dict(s) for s in span_batch]
+                    # Include current step count in metadata
+                    metadata = {}
+                    # Get the HIGHEST step count from the batch (most recent)
+                    step_count = 0
+                    for span in span_batch:
+                        if span.attributes and "hud.step_count" in span.attributes:
+                            current_step = span.attributes["hud.step_count"]
+                            if isinstance(current_step, int) and current_step > step_count:
+                                step_count = current_step
+
+                    payload = {
+                        "metadata": metadata,
+                        "telemetry": telemetry_spans,
+                    }
+
+                    # Only include step_count if we found any steps
+                    if step_count > 0:
+                        payload["step_count"] = step_count
+
+                    logger.debug("HUD exporter sending %d spans to %s", len(span_batch), url)
+                    make_request_sync(
+                        method="POST",
+                        url=url,
+                        json=payload,
+                        api_key=self._api_key,
+                    )
+                except Exception as exc:
+                    logger.exception(
+                        "HUD exporter failed to send spans for task %s: %s", run_id, exc
+                    )
+                    # If *any* group fails we return FAILURE so the OTEL SDK can retry
+                    return SpanExportResult.FAILURE
+
+            return SpanExportResult.SUCCESS
 
     def shutdown(self) -> None:  # type: ignore[override]
-        # Nothing to cleanup, httpx handled inside make_request_sync
-        pass
+        """Shutdown the exporter and wait for pending exports.
+
+        Waits up to 10 seconds for any in-flight exports to complete.
+        """
+        try:
+            if self._pending_futures:
+                with contextlib.suppress(Exception):
+                    cf.wait(self._pending_futures, timeout=10.0)
+        finally:
+            self._pending_futures.clear()
 
     def force_flush(self, timeout_millis: int | None = None) -> bool:  # type: ignore[override]
-        if timeout_millis:
-            time.sleep(timeout_millis / 1000)
-        # Synchronous export, nothing buffered here
-        return True
+        """Force flush all pending span exports.
+
+        Waits for all pending export futures to complete before returning.
+        This is called by the OpenTelemetry SDK during shutdown to ensure
+        all telemetry is uploaded.
+
+        Args:
+            timeout_millis: Maximum time to wait in milliseconds
+
+        Returns:
+            True if all exports completed, False otherwise
+        """
+        try:
+            if not self._pending_futures:
+                return True
+
+            total_pending = len(self._pending_futures)
+            if total_pending > 10:
+                # Show progress for large batches
+                logger.info("Flushing %d pending telemetry uploads...", total_pending)
+
+            timeout = (timeout_millis or 30000) / 1000.0
+            done, not_done = cf.wait(self._pending_futures, timeout=timeout)
+
+            # Consume exceptions to avoid "exception was never retrieved" warnings
+            for f in list(done):
+                with contextlib.suppress(Exception):
+                    _ = f.exception()
+
+            # Remove completed futures
+            for f in list(done):
+                with contextlib.suppress(ValueError):
+                    self._pending_futures.remove(f)
+
+            if total_pending > 10:
+                logger.info("Completed %d/%d telemetry uploads", len(done), total_pending)
+
+            return len(not_done) == 0
+        except Exception:
+            return False
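
The diff does not show how the exporter gets registered. Assuming the standard OpenTelemetry SDK wiring (the hud package presumably does something equivalent internally), it would typically sit behind a BatchSpanProcessor, whose shutdown path is what ends up invoking the new shutdown()/force_flush() logic above. The endpoint and key below are placeholders.

from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

from hud.otel.exporters import HudSpanExporter

# Placeholder values; the real configuration lives inside the hud package.
exporter = HudSpanExporter(telemetry_url="https://telemetry.example.invalid", api_key="sk-...")

provider = TracerProvider()
provider.add_span_processor(BatchSpanProcessor(exporter))  # calls exporter.export() in batches
trace.set_tracer_provider(provider)

# ... application code creates spans ...

# Draining the processor calls exporter.shutdown(), which waits (up to 10s)
# for any uploads still running in the export thread pool.
provider.shutdown()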
hud/rl/learner.py CHANGED
@@ -187,7 +187,7 @@ class GRPOLearner:
         # Use 8-bit optimizer if configured
         if self.config.training.use_8bit_optimizer and BNB_AVAILABLE:
             hud_console.info("Using 8-bit AdamW optimizer from bitsandbytes")
-            optimizer = bnb.optim.AdamW8bit(
+            optimizer = bnb.optim.AdamW8bit(  # type: ignore
                 trainable_params,
                 lr=self.config.training.lr,
                 betas=self.config.training.adam_betas,
@@ -5,7 +5,7 @@ from __future__ import annotations
5
 import logging
 from typing import TYPE_CHECKING, Any
 
-from hud.server import MCPServer
+from hud.server.server import MCPServer
 
 if TYPE_CHECKING:
     from collections.abc import Callable
hud/shared/exceptions.py CHANGED
@@ -69,11 +69,6 @@ class HudException(Exception):
         elif isinstance(exc_value, Exception):
             # Try to convert to a specific HudException
             result = cls._analyze_exception(exc_value, message or str(exc_value))
-            # If we couldn't categorize it (still base HudException),
-            # just re-raise the original exception
-            if type(result) is HudException:
-                # Re-raise the original exception unchanged
-                raise exc_value from None
             return result
 
         # Normal creation
@@ -7,7 +7,7 @@ classification and helpful hints for users.
 from __future__ import annotations
 
 import json
-from unittest.mock import Mock, patch
+from unittest.mock import Mock
 
 import httpx
 import pytest
@@ -17,6 +17,7 @@ from hud.shared.exceptions import (
     HudClientError,
     HudConfigError,
     HudException,
+    HudMCPError,
     HudRateLimitError,
     HudRequestError,
     HudTimeoutError,
@@ -243,23 +244,23 @@ class TestMCPErrorHandling:
     @pytest.mark.asyncio
     async def test_mcp_error_handling(self):
         """Test that McpError is handled appropriately."""
-        # Since McpError is imported dynamically, we'll mock it
-        with patch("hud.clients.mcp_use.McpError") as MockMcpError:
-            MockMcpError.side_effect = Exception
 
-            # Create a mock MCP error
-            mcp_error = Exception("MCP protocol error: Unknown method")
-            mcp_error.__class__.__name__ = "McpError"
+        # Create a mock McpError class
+        class McpError(Exception):
+            pass
 
-            try:
-                raise mcp_error
-            except Exception as e:
-                # This would typically be caught in the client code
-                # and re-raised as HudException
-                with pytest.raises(HudException) as exc_info:
-                    raise HudException from e
+        # Create a mock MCP error
+        mcp_error = McpError("MCP protocol error: Unknown method")
+
+        try:
+            raise mcp_error
+        except Exception as e:
+            # This would typically be caught in the client code
+            # and re-raised as HudException
+            with pytest.raises(HudMCPError) as exc_info:
+                raise HudException from e
 
-            assert "MCP protocol error" in str(exc_info.value)
+        assert "MCP protocol error" in str(exc_info.value)
 
     def test_mcp_tool_error_result(self):
         """Test handling of MCP tool execution errors (isError: true)."""
@@ -352,7 +353,7 @@ class TestExceptionRendering:
         assert len(error.hints) == 1
         assert error.hints[0] == HUD_API_KEY_MISSING
         assert error.hints[0].title == "HUD API key required"
-        assert "Set HUD_API_KEY environment variable" in error.hints[0].tips[0]
+        assert "Set HUD_API_KEY" in error.hints[0].tips[0]
 
     def test_exception_type_preservation(self):
         """Test that exception types are preserved through conversion."""
hud/telemetry/__init__.py CHANGED
@@ -1,14 +1,36 @@
-"""HUD Telemetry - User-facing APIs for tracing and job management.
+"""HUD Telemetry - Tracing and job management for agent execution.
 
-This module provides the main telemetry APIs that users interact with:
-- trace: Context manager for tracing code execution
-- job: Context manager and utilities for job management
-- instrument: Decorator for instrumenting functions
-- get_trace: Retrieve collected traces for replay/analysis
+Provides telemetry APIs for tracking agent execution and experiments.
+
+Standard Usage:
+    >>> import hud
+    >>> with hud.trace("My Task"):
+    ...     do_work()
+
+    >>> with hud.job("My Job") as job:
+    ...     with hud.trace("Task", job_id=job.id):
+    ...         do_work()
+
+High-Concurrency Usage (200+ parallel tasks):
+    >>> import hud
+    >>> async with hud.async_job("Evaluation") as job:
+    ...     async with hud.async_trace("Task", job_id=job.id):
+    ...         await do_async_work()
+
+APIs:
+    - trace(), job() - Standard context managers (for typical usage)
+    - async_trace(), async_job() - Async context managers (for high concurrency)
+    - instrument() - Decorator for instrumenting functions
+    - get_trace() - Retrieve collected traces for replay
+
+Note:
+    Use async_trace/async_job only for high-concurrency scenarios (200+ tasks).
+    The run_dataset() function uses them automatically.
 """
 
 from __future__ import annotations
 
+from .async_context import async_job, async_trace
 from .instrument import instrument
 from .job import Job, create_job, job
 from .replay import clear_trace, get_trace
@@ -17,6 +39,8 @@ from .trace import Trace, trace
 __all__ = [
     "Job",
     "Trace",
+    "async_job",
+    "async_trace",
     "clear_trace",
     "create_job",
     "get_trace",