hud-python 0.2.10__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic.

Files changed (86)
  1. hud/__init__.py +20 -8
  2. hud/adapters/common/adapter.py +14 -3
  3. hud/adapters/common/tests/test_adapter.py +16 -4
  4. hud/datasets.py +188 -0
  5. hud/env/docker_client.py +15 -3
  6. hud/env/environment.py +10 -7
  7. hud/env/local_docker_client.py +29 -7
  8. hud/env/remote_client.py +1 -1
  9. hud/env/remote_docker_client.py +2 -2
  10. hud/exceptions.py +2 -1
  11. hud/gym.py +0 -9
  12. hud/mcp/__init__.py +17 -0
  13. hud/mcp/base.py +631 -0
  14. hud/mcp/claude.py +321 -0
  15. hud/mcp/client.py +312 -0
  16. hud/mcp/langchain.py +250 -0
  17. hud/mcp/openai.py +334 -0
  18. hud/mcp/tests/__init__.py +1 -0
  19. hud/mcp/tests/test_base.py +512 -0
  20. hud/mcp/tests/test_claude.py +294 -0
  21. hud/mcp/tests/test_client.py +324 -0
  22. hud/mcp/tests/test_openai.py +238 -0
  23. hud/settings.py +20 -2
  24. hud/task.py +5 -88
  25. hud/taskset.py +2 -23
  26. hud/telemetry/__init__.py +16 -7
  27. hud/telemetry/_trace.py +246 -72
  28. hud/telemetry/context.py +88 -27
  29. hud/telemetry/exporter.py +171 -11
  30. hud/telemetry/instrumentation/mcp.py +174 -410
  31. hud/telemetry/job.py +141 -0
  32. hud/telemetry/mcp_models.py +13 -74
  33. hud/telemetry/tests/test_context.py +9 -6
  34. hud/telemetry/tests/test_trace.py +120 -78
  35. hud/tools/__init__.py +34 -0
  36. hud/tools/base.py +65 -0
  37. hud/tools/bash.py +137 -0
  38. hud/tools/computer/__init__.py +13 -0
  39. hud/tools/computer/anthropic.py +411 -0
  40. hud/tools/computer/hud.py +315 -0
  41. hud/tools/computer/openai.py +283 -0
  42. hud/tools/edit.py +290 -0
  43. hud/tools/executors/__init__.py +30 -0
  44. hud/tools/executors/base.py +331 -0
  45. hud/tools/executors/pyautogui.py +619 -0
  46. hud/tools/executors/tests/__init__.py +1 -0
  47. hud/tools/executors/tests/test_base_executor.py +338 -0
  48. hud/tools/executors/tests/test_pyautogui_executor.py +165 -0
  49. hud/tools/executors/xdo.py +503 -0
  50. hud/tools/helper/README.md +56 -0
  51. hud/tools/helper/__init__.py +9 -0
  52. hud/tools/helper/mcp_server.py +78 -0
  53. hud/tools/helper/server_initialization.py +115 -0
  54. hud/tools/helper/utils.py +58 -0
  55. hud/tools/playwright_tool.py +379 -0
  56. hud/tools/tests/__init__.py +3 -0
  57. hud/tools/tests/test_bash.py +152 -0
  58. hud/tools/tests/test_computer.py +52 -0
  59. hud/tools/tests/test_computer_actions.py +34 -0
  60. hud/tools/tests/test_edit.py +240 -0
  61. hud/tools/tests/test_init.py +27 -0
  62. hud/tools/tests/test_playwright_tool.py +183 -0
  63. hud/tools/tests/test_tools.py +157 -0
  64. hud/tools/tests/test_utils.py +156 -0
  65. hud/tools/utils.py +50 -0
  66. hud/trajectory.py +5 -1
  67. hud/types.py +10 -1
  68. hud/utils/tests/test_init.py +21 -0
  69. hud/utils/tests/test_version.py +1 -1
  70. hud/version.py +1 -1
  71. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/METADATA +27 -18
  72. hud_python-0.3.1.dist-info/RECORD +119 -0
  73. hud/evaluators/__init__.py +0 -9
  74. hud/evaluators/base.py +0 -32
  75. hud/evaluators/inspect.py +0 -24
  76. hud/evaluators/judge.py +0 -189
  77. hud/evaluators/match.py +0 -156
  78. hud/evaluators/remote.py +0 -65
  79. hud/evaluators/tests/__init__.py +0 -0
  80. hud/evaluators/tests/test_inspect.py +0 -12
  81. hud/evaluators/tests/test_judge.py +0 -231
  82. hud/evaluators/tests/test_match.py +0 -115
  83. hud/evaluators/tests/test_remote.py +0 -98
  84. hud_python-0.2.10.dist-info/RECORD +0 -85
  85. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/WHEEL +0 -0
  86. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/licenses/LICENSE +0 -0
hud/telemetry/exporter.py CHANGED
@@ -2,17 +2,19 @@ from __future__ import annotations
 
 import asyncio
 import concurrent.futures  # For run_coroutine_threadsafe return type
+import enum
 import json
 import logging
 import threading
 import time
-from datetime import datetime, timezone  # For ISO timestamp conversion
+from datetime import UTC, datetime  # For ISO timestamp conversion
 from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
     from collections.abc import Coroutine
 
 import httpx
+from pydantic import BaseModel
 
 from hud.settings import settings
 
@@ -25,6 +27,41 @@ from hud.telemetry.mcp_models import (  # MCPResponseCall for isinstance check
 
 logger = logging.getLogger("hud.telemetry")
 
+
+# --- Task Run Status Models ---
+class TaskRunStatus(enum.StrEnum):
+    INITIALIZING = "initializing"
+    RUNNING = "running"
+    EVALUATING = "evaluating"
+    COMPLETED = "completed"
+    ERROR = "error"
+
+
+class TaskRunStatusUpdateRequest(BaseModel):
+    """Request model for updating task run status."""
+
+    status: TaskRunStatus
+    error_message: str | None = None  # Optional error message if status is ERROR
+    metadata: dict[str, Any] | None = None  # Optional metadata for context
+    job_id: str | None = None  # Optional parent job ID
+
+
+# --- Job Status Models ---
+class JobStatus(enum.StrEnum):
+    RUNNING = "running"
+    COMPLETED = "completed"
+    ERROR = "error"
+
+
+class JobStatusUpdateRequest(BaseModel):
+    """Request model for updating job status."""
+
+    status: JobStatus
+    error_message: str | None = None  # Optional error message if status is ERROR
+    metadata: dict[str, Any] | None = None  # Optional metadata for context
+    taskset_name: str | None = None  # Optional dataset/taskset name
+
+
 # --- Worker Thread and Event Loop Management ---
 _worker_thread: threading.Thread | None = None
 _worker_loop: asyncio.AbstractEventLoop | None = None
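
Reviewer note, not part of the diff: the new request models lean on Pydantic's exclude_none behavior, so unset optional fields never reach the wire. A minimal sketch, assuming hud-python 0.3.1 is installed and importing the names added above:

    # Illustrative only; shows how the new status request serializes.
    from hud.telemetry.exporter import TaskRunStatus, TaskRunStatusUpdateRequest

    req = TaskRunStatusUpdateRequest(status=TaskRunStatus.ERROR, error_message="timeout")
    # exclude_none=True drops metadata and job_id, so the POST body carries only the set fields
    print(req.model_dump(exclude_none=True))
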
@@ -38,7 +75,8 @@ _export_lock_async = asyncio.Lock()  # Async lock for the async queue
 _export_task_async: asyncio.Task | None = None  # Async task for processing the queue
 
 # --- Constants ---
-EXPORT_INTERVAL = 5.0  # seconds
+EXPORT_INTERVAL = 5.0  # seconds - delay between non-incremental exports
+MIN_EXPORT_INTERVAL = 0.1  # seconds - minimum delay between any exports to avoid overwhelming
 # MAX_BATCH_SIZE removed as we send one trace payload at a time
 
 
@@ -157,7 +195,7 @@ async def export_telemetry(
     actual_start_time_float = getattr(mcp_call_model, "start_time", None)
     if actual_start_time_float:
         start_ts_iso = (
-            datetime.fromtimestamp(actual_start_time_float, timezone.utc)
+            datetime.fromtimestamp(actual_start_time_float, UTC)
             .isoformat()
             .replace("+00:00", "Z")
         )
@@ -170,7 +208,7 @@ async def export_telemetry(
 
     if effective_end_timestamp_float:
         end_ts_iso = (
-            datetime.fromtimestamp(effective_end_timestamp_float, timezone.utc)
+            datetime.fromtimestamp(effective_end_timestamp_float, UTC)
             .isoformat()
             .replace("+00:00", "Z")
         )
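
Reviewer note, not part of the diff: swapping timezone.utc for UTC (and enum.StrEnum above) means the module now requires Python 3.11 or newer; the formatting chain itself is unchanged. A standalone, standard-library-only sketch with an example epoch value:

    from datetime import UTC, datetime

    start_time = 1700000000.0  # example epoch seconds, as stored on the MCP call model
    start_ts_iso = datetime.fromtimestamp(start_time, UTC).isoformat().replace("+00:00", "Z")
    print(start_ts_iso)  # 2023-11-14T22:13:20Z
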
@@ -265,12 +303,19 @@ async def _process_export_queue_async() -> None:
 
             if isinstance(payload_to_process, dict):  # Ensure it's a dict before processing as such
                 await _export_trace_payload_async(payload_to_process)
+
+                # Apply appropriate delay based on export type
+                is_incremental = payload_to_process.get("attributes", {}).get("incremental", False)
+                if is_incremental:
+                    # Small delay for incremental exports to avoid overwhelming the server
+                    await asyncio.sleep(MIN_EXPORT_INTERVAL)
+                else:
+                    # Longer delay for final exports
+                    await asyncio.sleep(EXPORT_INTERVAL)
             else:
                 # Should not happen if only dicts and sentinel are queued
                 logger.warning("Unexpected item in telemetry queue: %s", type(payload_to_process))
 
-            await asyncio.sleep(EXPORT_INTERVAL)
-
     except asyncio.CancelledError:
         logger.debug("Async telemetry export processing task cancelled.")
         _export_task_async = None
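
Reviewer note, not part of the diff: the fixed 5-second sleep after every queue item is replaced by a per-payload delay read from the payload's own attributes. A small sketch of just that branch; the payload shape beyond the attributes.incremental flag is illustrative:

    import asyncio

    EXPORT_INTERVAL = 5.0
    MIN_EXPORT_INTERVAL = 0.1

    async def pace(payload: dict) -> None:
        # Incremental exports are throttled lightly; final exports keep the long delay.
        is_incremental = payload.get("attributes", {}).get("incremental", False)
        await asyncio.sleep(MIN_EXPORT_INTERVAL if is_incremental else EXPORT_INTERVAL)

    asyncio.run(pace({"attributes": {"incremental": True}}))  # sleeps ~0.1 s
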
@@ -340,6 +385,119 @@ async def send_telemetry_to_server(task_run_id: str, data: dict[str, Any]) -> None:
         logger.exception("Error exporting telemetry for task run %s: %s", task_run_id, e)
 
 
+async def update_task_run_status(
+    task_run_id: str,
+    status: TaskRunStatus,
+    error_message: str | None = None,
+    metadata: dict[str, Any] | None = None,
+    job_id: str | None = None,
+) -> None:
+    """Update the status of a task run."""
+    if not settings.telemetry_enabled:
+        logger.debug("Status update skipped - telemetry not enabled")
+        return
+
+    status_url = f"{settings.base_url}/v2/task_runs/{task_run_id}/status"
+
+    try:
+        async with httpx.AsyncClient() as client:
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {settings.api_key}",
+            }
+
+            request_data = TaskRunStatusUpdateRequest(
+                status=status, error_message=error_message, metadata=metadata, job_id=job_id
+            )
+
+            logger.debug(
+                "Updating status for task run %s to %s",
+                task_run_id,
+                status,
+            )
+
+            response = await client.post(
+                status_url,
+                json=request_data.model_dump(exclude_none=True),
+                headers=headers,
+                timeout=10.0,
+            )
+
+            if response.status_code >= 200 and response.status_code < 300:
+                logger.debug(
+                    "Successfully updated status for task run %s to %s",
+                    task_run_id,
+                    status,
+                )
+            else:
+                logger.warning(
+                    "Failed to update status for task run %s: HTTP %s - %s",
+                    task_run_id,
+                    response.status_code,
+                    response.text,
+                )
+    except Exception as e:
+        logger.exception("Error updating status for task run %s: %s", task_run_id, e)
+
+
+async def update_job_status(
+    job_id: str,
+    status: JobStatus,
+    error_message: str | None = None,
+    metadata: dict[str, Any] | None = None,
+    taskset_name: str | None = None,
+) -> None:
+    """Update the status of a job."""
+    if not settings.telemetry_enabled:
+        logger.debug("Job status update skipped - telemetry not enabled")
+        return
+
+    status_url = f"{settings.base_url}/v2/jobs/{job_id}/status"
+
+    try:
+        async with httpx.AsyncClient() as client:
+            headers = {
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {settings.api_key}",
+            }
+
+            request_data = JobStatusUpdateRequest(
+                status=status,
+                error_message=error_message,
+                metadata=metadata,
+                taskset_name=taskset_name,
+            )
+
+            logger.debug(
+                "Updating status for job %s to %s",
+                job_id,
+                status,
+            )
+
+            response = await client.post(
+                status_url,
+                json=request_data.model_dump(exclude_none=True),
+                headers=headers,
+                timeout=10.0,
+            )
+
+            if response.status_code >= 200 and response.status_code < 300:
+                logger.debug(
+                    "Successfully updated status for job %s to %s",
+                    job_id,
+                    status,
+                )
+            else:
+                logger.warning(
+                    "Failed to update status for job %s: HTTP %s - %s",
+                    job_id,
+                    response.status_code,
+                    response.text,
+                )
+    except Exception as e:
+        logger.exception("Error updating status for job %s: %s", job_id, e)
+
+
 # --- Public Shutdown Function ---
 def flush(timeout: float = 10.0) -> None:
     """Flushes pending telemetry data and stops the worker thread."""
@@ -375,15 +533,17 @@ def flush(timeout: float = 10.0) -> None:
     # This check is racy, but it's the best we can do without more complex inter-thread
     # sync for task completion. Give some time for the task to process the sentinel and
     # clear itself.
-    # Max wait for task to clear
-    attempt_timeout = time.time() + (timeout / 2 if timeout else 2.0)
+    # Max wait for task to clear - should be longer than EXPORT_INTERVAL to ensure
+    # the task has time to wake from sleep and process the sentinel
+    attempt_timeout = time.time() + (timeout / 2 if timeout else 2.0) + EXPORT_INTERVAL + 1.0
     while _export_task_async is not None and time.time() < attempt_timeout:
         time.sleep(0.1)
     # _export_task_async is set to None by _process_export_queue_async upon its exit.
     if _export_task_async is not None:
-        logger.warning(
-            "Telemetry processing task did not clear itself after sentinel. May still be "
-            "running or stuck."
+        # This is often a false positive due to race conditions during shutdown
+        logger.debug(
+            "Telemetry processing task did not clear itself after sentinel. "
+            "This is normal during shutdown."
         )
     else:
         logger.debug("Telemetry processing task appears to have completed after sentinel.")