PyPI - hud-python - Versions diffs - 0.4.59__tar.gz → 0.4.60__tar.gz - Mend

hud-python 0.4.59.tar.gz → 0.4.60.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (311) hide show

{hud_python-0.4.59 → hud_python-0.4.60}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.4.59
+Version: 0.4.60
 Summary: SDK for the HUD platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-python
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues

{hud_python-0.4.59 → hud_python-0.4.60}/environments/README.md RENAMED Viewed

@@ -496,7 +496,7 @@ from hud.clients import MCPClient
 async def main():
     # `trace` captures *everything* that happens and sends it to hud.ai
-    with hud.trace("local_test"):
+    async with hud.async_trace("local_test"):
         task = Task(
             prompt="Complete the task",
             mcp_config={

{hud_python-0.4.59 → hud_python-0.4.60}/environments/browser/server/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ version = "0.1.0"
 description = "HUD Browser MCP Server"
 requires-python = ">=3.11,<3.14"
 dependencies = [
-    "hud-python>=0.4.59",
+    "hud-python>=0.4.60",
     "httpx",
     "playwright",
     "pyautogui",

{hud_python-0.4.59 → hud_python-0.4.60}/hud/agents/gemini.py RENAMED Viewed

@@ -461,7 +461,8 @@ class GeminiAgent(MCPAgent):
     def _remove_old_screenshots(self, messages: list[genai_types.Content]) -> None:
         """
         Remove screenshots from old turns to manage context length.
-        Keeps only the last N turns with screenshots (configured via self.max_recent_turn_with_screenshots).
+        Keeps only the last N turns with screenshots (configured via
+        self.max_recent_turn_with_screenshots).
         """
         turn_with_screenshots_found = 0

{hud_python-0.4.59 → hud_python-0.4.60}/hud/cli/eval.py RENAMED Viewed

@@ -260,9 +260,8 @@ async def run_single_task(
 ) -> None:
     """Load one task and execute it, or detect if JSON contains a list and run as dataset."""
-    # Provide early feedback to user
     hud_console.info("🔧 Initializing evaluation...")
-    # Import Task and run_dataset lazily
     try:
         from hud.utils.tasks import load_tasks
     except ImportError as e:
@@ -399,23 +398,31 @@ async def run_single_task(
     if group_size > 1:
         hud_console.info(f"🔄 Running task with group_size={group_size}")
-        # Run with grouping
-        stats = await run_tasks_grouped(
-            tasks=[task],
-            agent_class=agent_class,
-            agent_config=agent_config,
-            group_size=group_size,
-            max_parallel_episodes=48,  # Same as RL default
-            max_steps=max_steps,
-            verbose=verbose,
-        )
+        async with hud.async_job(
+            name=f"Group Eval: {task_prompt[:50]}... (x{group_size})",
+            metadata={
+                "task_id": getattr(task, "id", None),
+                "group_size": group_size,
+                "total_episodes": group_size,
+            },
+        ) as job:
+            stats = await run_tasks_grouped(
+                tasks=[task],
+                agent_class=agent_class,
+                agent_config=agent_config,
+                group_size=group_size,
+                max_parallel_episodes=48,
+                max_steps=max_steps,
+                verbose=verbose,
+                job_id=job.id,
+            )
         display_group_statistics(stats, show_details=True)
     else:
         # Enable agent step logging for single task mode
         logging.getLogger("hud.agents").setLevel(logging.INFO)
         logging.getLogger("hud.agents.base").setLevel(logging.INFO)
-        with hud.trace(name=task_prompt):
+        async with hud.async_trace(name=task_prompt):
             agent = build_agent(
                 agent_type,
                 model=model,
@@ -442,10 +449,8 @@ async def run_full_dataset(
 ) -> list[Any]:
     """Run evaluation across the entire dataset using asyncio-based concurrency."""
-    # Provide early feedback to user
     hud_console.info("🔧 Initializing evaluation...")
-    # Import run_dataset lazily
     try:
         from hud.datasets import run_dataset
         from hud.utils.tasks import load_tasks
@@ -627,7 +632,7 @@ async def run_full_dataset(
         hud_console.info(f"🔄 Running dataset with group_size={group_size}")
         # Run with job tracking
-        with hud.job(
+        async with hud.async_job(
             name=f"Evaluation {dataset_name} (group_size={group_size})",
             metadata={
                 "dataset": source,

{hud_python-0.4.59 → hud_python-0.4.60}/hud/datasets/parallel.py RENAMED Viewed

@@ -371,7 +371,7 @@ async def run_dataset_parallel_manual(
             logger.warning("Failed to extract dataset verification info")
     # Create job context
-    with hud.job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
+    async with hud.async_job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
         # Prepare agent class info for pickling
         agent_module = agent_class.__module__
         agent_name = agent_class.__name__

{hud_python-0.4.59 → hud_python-0.4.60}/hud/datasets/runner.py RENAMED Viewed

@@ -30,20 +30,14 @@ async def run_dataset(
 ) -> list[Any]:
     """Run all tasks in a dataset with automatic job and telemetry tracking.
-    This function handles concurrent task execution with proper telemetry collection.
-    All tasks are executed in parallel up to `max_concurrent`, with full telemetry
-    automatically uploaded to the HUD platform.
     Args:
         name: Name for the job
         dataset: HuggingFace dataset identifier (e.g. "hud-evals/SheetBench-50"),
                 Dataset object, OR list of Task objects
         agent_class: Agent class to instantiate (e.g., ClaudeAgent)
-        agent_config: Configuration/kwargs for agent (model, etc.)
-        max_concurrent: Maximum parallel task execution. Higher values improve throughput
-                       but may increase memory usage. Recommended: 30-200 depending on
-                       task complexity and available resources.
-        metadata: Optional metadata for the job
+        agent_config: Configuration kwargs for agent initialization
+        max_concurrent: Maximum concurrent tasks (recommended: 50-200)
+        metadata: Optional job metadata
         max_steps: Maximum steps per task
         split: Dataset split to use when loading from string (default: "train")
         auto_respond: Whether to use auto-response agent
@@ -101,7 +95,6 @@ async def run_dataset(
         except Exception:
             logger.warning("Failed to extract dataset verification info")
-    # Use async job context manager for high-concurrency telemetry
     async with hud.async_job(name, metadata=job_metadata, dataset_link=dataset_link) as job_obj:
         # Run tasks with semaphore for concurrency control
         sem = asyncio.Semaphore(max_concurrent)
@@ -112,12 +105,10 @@ async def run_dataset(
                 try:
                     # Create trace for this task
                     task_name = task_dict.get("prompt") or f"Task {index}"
-                    # Ensure task_id is a string for baggage propagation
                     raw_task_id = task_dict.get("id")
                     safe_task_id = str(raw_task_id) if raw_task_id is not None else None
                     async with hud.async_trace(task_name, job_id=job_obj.id, task_id=safe_task_id):
-                        # with hud.trace(task_name, job_id=job_obj.id, task_id=safe_task_id):
                         # Convert dict to Task here, at trace level
                         task = Task(**task_dict)
@@ -141,44 +132,4 @@ async def run_dataset(
             if isinstance(result, Exception):
                 logger.error("Worker %s failed with exception: %s", i, result, exc_info=result)
-    # Ensure all telemetry is uploaded before returning
-    await _flush_telemetry()
     return results
-async def _flush_telemetry() -> None:
-    """Flush all pending telemetry operations.
-    Ensures complete telemetry upload by:
-    1. Waiting for all async status updates to complete
-    2. Forcing OpenTelemetry span processor to export remaining spans
-    This prevents telemetry loss at high concurrency (200+ tasks) by ensuring
-    all operations complete before process exit.
-    """
-    from hud.otel.config import is_telemetry_configured
-    from hud.utils import hud_console
-    from hud.utils.task_tracking import wait_all_tasks
-    hud_console.info("Uploading telemetry...")
-    # Step 1: Wait for async status updates (job/trace status)
-    completed_tasks = await wait_all_tasks(timeout_seconds=20.0)
-    if completed_tasks > 0:
-        hud_console.info(f"Completed {completed_tasks} pending telemetry tasks")
-    # Step 2: Flush OpenTelemetry span exports
-    if is_telemetry_configured():
-        try:
-            from opentelemetry import trace
-            from opentelemetry.sdk.trace import TracerProvider
-            provider = trace.get_tracer_provider()
-            if isinstance(provider, TracerProvider):
-                provider.force_flush(timeout_millis=20000)
-                logger.debug("OpenTelemetry spans flushed successfully")
-        except Exception as e:
-            logger.warning("Failed to flush OpenTelemetry: %s", e)
-    hud_console.info("Telemetry uploaded successfully")

hud_python-0.4.60/hud/datasets/tests/test_runner.py ADDED Viewed

@@ -0,0 +1,67 @@
+from __future__ import annotations
+from unittest.mock import MagicMock, patch
+import pytest
+from hud.telemetry.utils import flush_telemetry
+@pytest.mark.asyncio
+async def test_flush_telemetry():
+    """Test flush_telemetry function."""
+    with (
+        patch("hud.otel.config.is_telemetry_configured", return_value=True),
+        patch("hud.utils.hud_console.hud_console"),
+        patch("opentelemetry.trace.get_tracer_provider") as mock_get_provider,
+    ):
+        from opentelemetry.sdk.trace import TracerProvider
+        mock_provider = MagicMock(spec=TracerProvider)
+        mock_provider.force_flush.return_value = True
+        mock_get_provider.return_value = mock_provider
+        await flush_telemetry()
+        mock_provider.force_flush.assert_called_once_with(timeout_millis=5000)
+@pytest.mark.asyncio
+async def test_flush_telemetry_not_configured():
+    """Test flush_telemetry when telemetry is not configured."""
+    with patch("hud.otel.config.is_telemetry_configured", return_value=False):
+        await flush_telemetry()
+@pytest.mark.asyncio
+async def test_flush_telemetry_exception():
+    """Test flush_telemetry handles exceptions gracefully."""
+    with (
+        patch("hud.otel.config.is_telemetry_configured", return_value=True),
+        patch("hud.utils.hud_console.hud_console"),
+        patch("opentelemetry.trace.get_tracer_provider") as mock_get_provider,
+    ):
+        from opentelemetry.sdk.trace import TracerProvider
+        mock_provider = MagicMock(spec=TracerProvider)
+        mock_provider.force_flush.side_effect = Exception("Flush failed")
+        mock_get_provider.return_value = mock_provider
+        await flush_telemetry()
+@pytest.mark.asyncio
+async def test_flush_telemetry_timeout():
+    """Test flush_telemetry when force_flush times out."""
+    with (
+        patch("hud.otel.config.is_telemetry_configured", return_value=True),
+        patch("hud.utils.hud_console.hud_console"),
+        patch("opentelemetry.trace.get_tracer_provider") as mock_get_provider,
+    ):
+        from opentelemetry.sdk.trace import TracerProvider
+        mock_provider = MagicMock(spec=TracerProvider)
+        mock_provider.force_flush.return_value = False
+        mock_get_provider.return_value = mock_provider
+        await flush_telemetry()

{hud_python-0.4.59 → hud_python-0.4.60}/hud/otel/context.py RENAMED Viewed

@@ -22,7 +22,6 @@ if TYPE_CHECKING:
 from hud.settings import settings
 from hud.shared import make_request, make_request_sync
-from hud.utils.async_utils import fire_and_forget
 logger = logging.getLogger(__name__)
@@ -301,32 +300,6 @@ async def _update_task_status_async(
             logger.warning("Failed to update task status: %s", e)
-def _fire_and_forget_status_update(
-    task_run_id: str,
-    status: str,
-    job_id: str | None = None,
-    error_message: str | None = None,
-    trace_name: str | None = None,
-    task_id: str | None = None,
-    group_id: str | None = None,
-    extra_metadata: dict[str, Any] | None = None,
-) -> None:
-    """Fire and forget status update - works in any context including Jupyter."""
-    fire_and_forget(
-        _update_task_status_async(
-            task_run_id,
-            status,
-            job_id,
-            error_message,
-            trace_name,
-            task_id,
-            group_id,
-            extra_metadata,
-        ),
-        f"update task {task_run_id} status to {status}",
-    )
 def _update_task_status_sync(
     task_run_id: str,
     status: str,
@@ -468,7 +441,7 @@ def _print_trace_complete_url(task_run_id: str, error_occurred: bool = False) ->
 class trace:
     """Internal OpenTelemetry trace context manager.
-    This is the implementation class. Users should use hud.trace() instead.
+    This is the sync implementation. For async code, use hud.async_trace() instead.
     """
     def __init__(
@@ -532,9 +505,9 @@ class trace:
         )
         self._span = self._span_manager.__enter__()
-        # Update task status to running if root (only for HUD backend)
+        # Update task status to running (sync call - blocking is expected)
         if self.is_root and settings.telemetry_enabled and settings.api_key:
-            _fire_and_forget_status_update(
+            _update_task_status_sync(
                 self.task_run_id,
                 "running",
                 job_id=self.job_id,
@@ -542,7 +515,6 @@ class trace:
                 task_id=self.task_id,
                 group_id=self.group_id,
             )
-            # Print the nice trace URL box (only if not part of a job)
             if not self.job_id:
                 _print_trace_url(self.task_run_id)
@@ -556,35 +528,20 @@ class trace:
         exc_tb: TracebackType | None,
     ) -> None:
         """Exit the trace context."""
-        # Update task status if root (only for HUD backend)
+        # Update task status (sync call - blocking is expected for sync context manager)
         if self.is_root and settings.telemetry_enabled and settings.api_key:
-            if exc_type is not None:
-                # Use fire-and-forget to avoid blocking the event loop
-                _fire_and_forget_status_update(
-                    self.task_run_id,
-                    "error",
-                    job_id=self.job_id,
-                    error_message=str(exc_val),
-                    trace_name=self.span_name,
-                    task_id=self.task_id,
-                    group_id=self.group_id,
-                )
-                # Print error completion message (only if not part of a job)
-                if not self.job_id:
-                    _print_trace_complete_url(self.task_run_id, error_occurred=True)
-            else:
-                # Use fire-and-forget to avoid blocking the event loop
-                _fire_and_forget_status_update(
-                    self.task_run_id,
-                    "completed",
-                    job_id=self.job_id,
-                    trace_name=self.span_name,
-                    task_id=self.task_id,
-                    group_id=self.group_id,
-                )
-                # Print success completion message (only if not part of a job)
-                if not self.job_id:
-                    _print_trace_complete_url(self.task_run_id, error_occurred=False)
+            status = "error" if exc_type else "completed"
+            _update_task_status_sync(
+                self.task_run_id,
+                status,
+                job_id=self.job_id,
+                error_message=str(exc_val) if exc_val else None,
+                trace_name=self.span_name,
+                task_id=self.task_id,
+                group_id=self.group_id,
+            )
+            if not self.job_id:
+                _print_trace_complete_url(self.task_run_id, error_occurred=bool(exc_type))
         # End the span
         if self._span and self._span_manager is not None:

{hud_python-0.4.59 → hud_python-0.4.60}/hud/rl/actor.py RENAMED Viewed

@@ -109,7 +109,7 @@ class Actor:
         # Run the task
         try:
-            with hud.trace(f"Training | {task.prompt}", job_id=job_id):
+            async with hud.async_trace(f"Training | {task.prompt}", job_id=job_id):
                 result = await agent.run(task, max_steps=self.actor_config.max_steps_per_episode)
         except Exception:

{hud_python-0.4.59 → hud_python-0.4.60}/hud/telemetry/__init__.py RENAMED Viewed

@@ -2,30 +2,27 @@
 Provides telemetry APIs for tracking agent execution and experiments.
-Standard Usage:
+Async Usage (Recommended):
     >>> import hud
-    >>> with hud.trace("My Task"):
-    ...     do_work()
+    >>> async with hud.async_trace("Task"):
+    ...     await agent.run(task)
+    >>> async with hud.async_job("Evaluation") as job:
+    ...     async with hud.async_trace("Task", job_id=job.id):
+    ...         await agent.run(task)
+Sync Usage:
+    >>> import hud
+    >>> with hud.trace("Task"):
+    ...     do_work()
     >>> with hud.job("My Job") as job:
     ...     with hud.trace("Task", job_id=job.id):
     ...         do_work()
-High-Concurrency Usage (200+ parallel tasks):
-    >>> import hud
-    >>> async with hud.async_job("Evaluation") as job:
-    ...     async with hud.async_trace("Task", job_id=job.id):
-    ...         await do_async_work()
 APIs:
-    - trace(), job() - Standard context managers (for typical usage)
-    - async_trace(), async_job() - Async context managers (for high concurrency)
-    - instrument() - Decorator for instrumenting functions
-    - get_trace() - Retrieve collected traces for replay
-Note:
-    Use async_trace/async_job only for high-concurrency scenarios (200+ tasks).
-    The run_dataset() function uses them automatically.
+    - async_trace(), async_job() - Async context managers (recommended)
+    - trace(), job() - Sync context managers
+    - flush_telemetry() - Manual span flushing (rarely needed)
+    - instrument() - Function instrumentation decorator
 """
 from __future__ import annotations

hud-python 0.4.59__tar.gz → 0.4.60__tar.gz

hud-python 0.4.59.tar.gz → 0.4.60.tar.gz