PyPI - hud-python - Versions diffs - 0.1.5__py3-none-any.whl → 0.2.1__py3-none-any.whl - Mend

hud-python 0.1.5__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hud-python might be problematic. Click here for more details.

Files changed (46) hide show

hud/__init__.py +16 -12
hud/adapters/__init__.py +4 -2
hud/adapters/claude/adapter.py +9 -2
hud/adapters/common/adapter.py +11 -10
hud/adapters/common/types.py +34 -13
hud/adapters/operator/__init__.py +5 -0
hud/adapters/operator/adapter.py +97 -0
hud/agent/__init__.py +7 -0
hud/agent/base.py +109 -0
hud/agent/claude.py +207 -0
hud/agent/operator.py +208 -0
hud/env/__init__.py +11 -0
hud/env/client.py +35 -0
hud/env/docker_client.py +306 -0
hud/env/environment.py +354 -0
hud/env/local_docker_client.py +251 -0
hud/env/remote_client.py +185 -0
hud/env/remote_docker_client.py +221 -0
hud/evaluators/__init__.py +10 -0
hud/evaluators/base.py +31 -0
hud/evaluators/inspect.py +29 -0
hud/evaluators/judge.py +213 -0
hud/evaluators/match.py +163 -0
hud/evaluators/remote.py +78 -0
hud/gym.py +101 -15
hud/job.py +185 -0
hud/server/__init__.py +2 -2
hud/server/requests.py +87 -0
hud/settings.py +13 -2
hud/task.py +144 -0
hud/taskset.py +103 -0
hud/trajectory.py +90 -0
hud/types.py +65 -0
hud/utils/__init__.py +4 -2
hud/utils/common.py +96 -0
hud/utils/config.py +91 -4
hud/utils/telemetry.py +67 -0
hud_python-0.2.1.dist-info/METADATA +181 -0
hud_python-0.2.1.dist-info/RECORD +44 -0
{hud_python-0.1.5.dist-info → hud_python-0.2.1.dist-info}/licenses/LICENSE +1 -1
hud/client.py +0 -200
hud/environment.py +0 -318
hud/run.py +0 -208
hud_python-0.1.5.dist-info/METADATA +0 -125
hud_python-0.1.5.dist-info/RECORD +0 -21
{hud_python-0.1.5.dist-info → hud_python-0.2.1.dist-info}/WHEEL +0 -0

hud/evaluators/match.py ADDED Viewed

@@ -0,0 +1,163 @@
+from __future__ import annotations
+import re
+from difflib import SequenceMatcher
+from typing import Any
+from textdistance import levenshtein
+from hud.evaluators.base import EvaluationResult
+def match_single(response: Any, answer: Any) -> EvaluationResult:
+    """Report whether the expected answer occurs inside the response.
+    Both values are stringified, lower-cased and stripped of surrounding
+    whitespace, then compared with a substring containment test.
+    Args:
+        response: The response to evaluate
+        answer: The expected answer
+    Returns:
+        EvaluationResult with score=1.0 when the answer is contained in the
+        response, 0.0 otherwise
+    """
+    needle = str(answer).lower().strip()
+    haystack = str(response).lower().strip()
+    if needle in haystack:
+        score, reason = 1.0, "Exact match"
+    else:
+        score, reason = 0.0, "No exact match found"
+    return EvaluationResult(score=score, reason=reason, mode="single")
+def match_all(response: Any, answers: list) -> EvaluationResult:
+    """Score the fraction of expected answers that occur in the response.
+    Matching is a case-insensitive substring test on the stringified values.
+    Args:
+        response: The response to evaluate
+        answers: List of expected answers
+    Returns:
+        EvaluationResult with score=proportion of matches (0.0-1.0); an empty
+        ``answers`` list scores 0.0 and the reason says nothing was expected
+    """
+    if not answers:
+        # Nothing to look for: keep the historical 0.0 score, but do not pair it
+        # with a reason claiming that "all" expected items were found.
+        return EvaluationResult(
+            score=0.0,
+            reason="No expected answers provided",
+            mode="all"
+        )
+    response_str = str(response).lower()
+    total = len(answers)
+    matches = sum(1 for answer in answers if str(answer).lower() in response_str)
+    if matches == total:
+        reason = f"All {matches} expected items found"
+    else:
+        reason = f"Only {matches} of {total} expected items found"
+    return EvaluationResult(
+        score=matches / total,
+        reason=reason,
+        mode="all"
+    )
+def match_fuzzy(response: Any, answer: Any) -> EvaluationResult:
+    """Score how close the response is to the answer via Levenshtein distance.
+    Both values are stringified and lower-cased. The edit distance is divided
+    by the length of the longer string and subtracted from 1.0.
+    Args:
+        response: The response to evaluate
+        answer: The expected answer
+    Returns:
+        EvaluationResult with score=similarity (0.0-1.0)
+    """
+    left = str(response).lower()
+    right = str(answer).lower()
+    if left == right:
+        similarity = 1.0
+    elif not left or not right:
+        # Exactly one side is empty (equal strings were handled above)
+        similarity = 0.0
+    else:
+        longest = max(len(left), len(right))
+        similarity = 1.0 - (levenshtein.distance(left, right) / longest)
+    return EvaluationResult(
+        score=similarity,
+        reason=f"Fuzzy match with {similarity:.1%} similarity",
+        mode="fuzz"
+    )
+def match_regex(response: Any, pattern: str) -> EvaluationResult:
+    """Search the stringified response for a regular expression.
+    The pattern is compiled with re.DOTALL, so "." also matches newlines.
+    Args:
+        response: The response to evaluate
+        pattern: Regular expression pattern to match
+    Returns:
+        EvaluationResult with score=1.0 if the pattern is found anywhere in the
+        response, 0.0 otherwise (including when the pattern is invalid)
+    """
+    try:
+        found = re.compile(pattern, re.DOTALL).search(str(response)) is not None
+    except re.error:
+        return EvaluationResult(score=0.0, reason="Invalid regex pattern", mode="regex")
+    if found:
+        return EvaluationResult(score=1.0, reason="Regex pattern matched", mode="regex")
+    return EvaluationResult(score=0.0, reason="Regex pattern did not match", mode="regex")
+def match_diff(response: Any, answer: Any) -> EvaluationResult:
+    """Score the similarity of response and answer.
+    When both values are int/float they are compared numerically; any other
+    combination is compared as strings.
+    Args:
+        response: The response to evaluate
+        answer: The expected answer
+    Returns:
+        EvaluationResult with score=similarity (0.0-1.0)
+    """
+    both_numeric = isinstance(response, (int, float)) and isinstance(answer, (int, float))
+    if both_numeric:
+        similarity = _match_numeric_diff(response, answer)
+        explanation = f"Numeric difference: {abs(response - answer)}"
+        return EvaluationResult(score=similarity, reason=explanation, mode="diff")
+    similarity = _match_string_diff(response, answer)
+    return EvaluationResult(
+        score=similarity,
+        reason=f"String difference with {similarity:.1%} similarity",
+        mode="diff"
+    )
+def _match_string_diff(response: Any, answer: Any) -> float:
+    """Return the difflib similarity ratio of the two stringified values."""
+    left, right = str(response), str(answer)
+    return SequenceMatcher(a=left, b=right).ratio()
+def _match_numeric_diff(response: float, answer: float) -> float:
+    """Calculate a normalized similarity between two numeric values.
+    The absolute difference is divided by the larger magnitude of the two
+    inputs, capped at 1.0 and inverted.
+    Args:
+        response: The numeric response
+        answer: The expected numeric answer
+    Returns:
+        A value between 0 and 1, where 1 means identical and 0 means the values
+        differ by at least the larger magnitude (e.g. opposite signs). NaN on
+        either side always scores 0.0.
+    """
+    import math
+    if response == answer:
+        return 1.0
+    diff = abs(response - answer)
+    if isinstance(diff, float) and math.isnan(diff):
+        # NaN never equals anything; without this guard (0, nan) was scored as
+        # identical while (nan, 0) was scored as a total mismatch.
+        return 0.0
+    # Unequal, non-NaN values cannot both be zero, so the divisor is non-zero.
+    max_val = max(abs(response), abs(answer))
+    # The ratio reaches 2.0 for opposite signs (and is NaN for inf/inf); min()
+    # caps both cases at 1.0 so the score never drops below 0.
+    return 1.0 - min(1.0, diff / max_val)

hud/evaluators/remote.py ADDED Viewed

@@ -0,0 +1,78 @@
+from __future__ import annotations
+import asyncio
+from typing import Any
+from hud.evaluators.base import EvaluationResult
+from hud.server import make_request
+from hud.settings import settings
+async def _remote_eval_call(
+    response: Any,
+    answer: Any,
+    eval_type: str,
+    config: dict[str, Any] | None = None
+) -> dict[str, Any]:
+    """POST an evaluation request to the remote server and return its reply.
+    Any exception raised while building or sending the request is converted
+    into a result dict with score -1.0 instead of propagating.
+    Args:
+        response: The response to evaluate
+        answer: The reference answer to compare against
+        eval_type: Type of evaluation (e.g., "match", "judge", "agent")
+        config: Optional configuration parameters (sent as {} when omitted)
+    Returns:
+        Dictionary with evaluation results from the server, or a failure
+        dictionary with keys "score", "reason" and "details"
+    """
+    try:
+        payload = {
+            "response": response,
+            "answer": answer,
+            "type": eval_type,
+            "config": config or {}
+        }
+        endpoint = f"{settings.base_url}/evaluations/evaluate"
+        return await make_request(
+            method="POST",
+            url=endpoint,
+            json=payload,
+            api_key=settings.api_key,
+        )
+    except Exception as exc:
+        failure = {
+            "score": -1.0,
+            "reason": f"Remote evaluation failed: {exc!s}",
+            "details": {}
+        }
+        return failure
+def remote_evaluate(
+    response: Any,
+    answer: Any,
+    eval_type: str = "default",
+    config: dict[str, Any] | None = None
+) -> EvaluationResult:
+    """Evaluate a response using remote evaluation services.
+    This is a blocking call. It can be used both from plain synchronous code
+    and from code that is already running inside an asyncio event loop; in the
+    latter case the calling thread (and therefore its loop) is blocked until
+    the remote request finishes.
+    Args:
+        response: The response to evaluate
+        answer: The reference answer to compare against
+        eval_type: Type of evaluation to perform
+        config: Optional configuration for the evaluation
+    Returns:
+        EvaluationResult containing the evaluation results; score is -1.0 when
+        the remote call failed or returned no score
+    """
+    import concurrent.futures
+    def _run() -> dict[str, Any]:
+        # Create the coroutine inside the runner so it is built on the same
+        # thread whose event loop will await it.
+        return asyncio.run(_remote_eval_call(
+            response=response,
+            answer=answer,
+            eval_type=eval_type,
+            config=config
+        ))
+    try:
+        asyncio.get_running_loop()
+    except RuntimeError:
+        # No loop is running in this thread: asyncio.run() can be used directly.
+        result = _run()
+    else:
+        # asyncio.run() raises RuntimeError inside a running loop, so run the
+        # request on a short-lived worker thread that owns its own loop.
+        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
+            result = pool.submit(_run).result()
+    return EvaluationResult(
+        score=result.get("score", -1.0),
+        reason=result.get("reason", "Remote evaluation completed"),
+        mode=eval_type,
+        criteria_scores=result.get("details", {})
+    )

hud/gym.py CHANGED Viewed

@@ -1,22 +1,108 @@
 from __future__ import annotations
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
-class Gym:
-    """
-    Represents a simulation environment in the HUD system.
+from hud.env.environment import Environment
+from hud.env.local_docker_client import LocalDockerClient
+from hud.env.remote_client import RemoteClient
+from hud.env.remote_docker_client import RemoteDockerClient
+from hud.task import Task
+from hud.types import CustomGym, Gym
+from hud.utils.common import get_gym_id
+if TYPE_CHECKING:
+    from hud.job import Job
+logger = logging.getLogger("hud.gym")
-    Attributes:
-        id: Unique identifier for the gym
-        name: Human-readable name of the gym
+async def make(
+    env_src: Gym | Task,
+    *,
+    job: Job | None = None,
+    job_id: str | None = None,
+    metadata: dict[str, Any] | None = None,
+) -> Environment:
     """
+    Create an environment from an environment ID or a Task object.
+    Args:
+        env_src: Environment ID or Task object
+        job: Job object to associate with this environment
+        job_id: ID of job to associate with this environment (deprecated, use job instead)
+        metadata: Additional metadata for the environment
+    """
+    if metadata is None:
+        metadata = {}
+    # Handle job parameter
+    effective_job_id = None
+    if job is not None:
+        effective_job_id = job.id
+    elif job_id is not None:
+        effective_job_id = job_id
+    else:
+        # Try to get an active job from the decorator context
+        try:
+            from hud.job import get_active_job
+            active_job = get_active_job()
+            if active_job:
+                effective_job_id = active_job.id
+        except ImportError:
+            pass  # Module not available, skip
+    gym = None
+    task = None
+    if isinstance(env_src, Gym):
+        gym = env_src
+    elif isinstance(env_src, Task):
+        gym = env_src.gym
+        task = env_src
+    if isinstance(gym, CustomGym):
+        # Create the environment (depending on location)
+        if gym.dockerfile is None:
+            raise ValueError("Dockerfile is required for custom environments")
+        if gym.location == "local":
+            logger.info("Creating local environment")
+            client, build_data = await LocalDockerClient.create(gym.dockerfile)
+        elif gym.location == "remote":
+            logger.info("Creating remote environment")
+            client, build_data = await RemoteDockerClient.create(
+                dockerfile=gym.dockerfile,
+                job_id=effective_job_id,
+                task_id=task.id if task else None,
+                metadata=metadata,
+            )
+        else:
+            raise ValueError(f"Invalid environment location: {gym.location}")
+        # Set up the environment with a source path
+        if gym.controller_source_dir:
+            logger.info("Setting source path")
+            client.set_source_path(Path(gym.controller_source_dir))
+    elif isinstance(gym, str):
+        logger.info("Creating private environment")
+        # Note: the gym_name_or_id is a unique identifier, but it is not a true
+        # gym_id for the purposes of building the environment
+        # we therefore fetch the gym_id from the HUD API here
+        true_gym_id = await get_gym_id(gym)
+        # Create the environment
+        client, build_data = await RemoteClient.create(
+            gym_id=true_gym_id,
+            job_id=effective_job_id,
+            task_id=task.id if task else None,
+            metadata=metadata,
+        )
+    else:
+        raise ValueError(f"Invalid gym source: {gym}")
-    def __init__(self, id: str, name: str) -> None:
-        """
-        Initialize a gym.
+    # Create the environment itself
+    environment = Environment(client=client, metadata=metadata, task=task, build_data=build_data)
+    if task:
+        await environment._setup()
-        Args:
-            id: Unique identifier
-            name: Human-readable name
-        """
-        self.id = id
-        self.name = name
+    return environment

hud/job.py ADDED Viewed

@@ -0,0 +1,185 @@
+from __future__ import annotations
+import datetime
+import functools
+import inspect
+import logging
+from collections.abc import Callable
+from typing import Any, TypeVar, cast
+from pydantic import BaseModel, TypeAdapter
+from hud.server import make_request
+from hud.settings import settings
+from hud.trajectory import Trajectory
+logger = logging.getLogger("hud.job")
+# Type variable for the decorator
+T = TypeVar("T", bound=Callable)
+# Global registry to store active jobs created by decorators
+_ACTIVE_JOBS = {}
+class Job(BaseModel):
+    """
+    A job represents a collection of related trajectories.
+    It holds metadata and provides methods to interact with job data.
+    Instances should typically be obtained via `create_job` or `load_job`.
+    """
+    # Identifier used to build the job's API URLs (see load_trajectories)
+    id: str
+    name: str
+    # Free-form metadata; None when no metadata is attached
+    metadata: dict[str, Any] | None = None
+    created_at: datetime.datetime
+    status: str
+    async def load_trajectories(self, *, api_key: str | None = None) -> list[Trajectory]:
+        """
+        Loads the trajectories associated with this job.
+        Args:
+            api_key: API key for the request; falls back to settings.api_key
+        Returns:
+            List[Trajectory]: The trajectories in the job
+        """
+        api_key = api_key or settings.api_key
+        data = await make_request(
+            method="GET",
+            url=f"{settings.base_url}/v2/jobs/{self.id}/trajectories",
+            api_key=api_key,
+        )
+        # Validate the raw response payload into Trajectory models
+        return TypeAdapter(list[Trajectory]).validate_python(data)
+async def create_job(name: str, gym_id: str | None = None,
+                     evalset_id: str | None = None,
+                     metadata: dict[str, Any] | None = None) -> Job:
+    """
+    Creates a new job.
+    Args:
+        name: The name of the job
+        gym_id: Optional gym identifier sent with the creation request
+        evalset_id: Optional evalset identifier sent with the creation request
+        metadata: Metadata for the job (an empty dict is sent when omitted)
+    Returns:
+        Job: The created job instance
+    Raises:
+        KeyError: If the response lacks id, name, created_at or status
+        ValueError: If the returned fields fail validation (pydantic's
+            ValidationError is a ValueError subclass)
+    """
+    data = await make_request(
+        method="POST",
+        url=f"{settings.base_url}/v2/jobs",
+        json={
+            "name": name,
+            "metadata": metadata or {},
+            "gym_id": gym_id,
+            "evalset_id": evalset_id,
+        },
+        api_key=settings.api_key,
+    )
+    # The backend is assumed to return the job fields on creation. Validation
+    # goes through the model, as load_job does, so created_at is parsed by
+    # pydantic: datetime.fromisoformat rejects a trailing "Z" before Python 3.11.
+    return Job.model_validate({
+        "id": data["id"],
+        "name": data["name"],
+        "metadata": data.get("metadata", {}),
+        "created_at": data["created_at"],
+        "status": data["status"],
+    })
+async def load_job(job_id: str, api_key: str | None = None) -> Job:
+    """
+    Fetches a job from the API by its ID.
+    Args:
+        job_id: The ID of the job to retrieve
+        api_key: API key for the request; falls back to settings.api_key
+    Returns:
+        Job: The retrieved job instance
+    Raises:
+        ValueError: If the API returns an empty response for this ID
+    """
+    resolved_key = api_key or settings.api_key
+    payload = await make_request(
+        method="GET",
+        url=f"{settings.base_url}/v2/jobs/{job_id}",
+        api_key=resolved_key,
+    )
+    if payload:
+        # Build the Job model straight from the response payload
+        return Job.model_validate(payload)
+    raise ValueError(f"Job {job_id} not found")
+def job(
+    name: str,
+    metadata: dict[str, Any] | None = None
+) -> Callable[[T], T]:
+    """
+    Decorator to automatically create and associate a job with all environments
+    created within the decorated function.
+    A new job is created on every call of the decorated coroutine function and
+    registered in _ACTIVE_JOBS for the duration of that call. get_active_job()
+    finds it by walking the call stack, so only code reached through the
+    decorated call's own frames can see it.
+    Args:
+        name: The name of the job
+        metadata: Additional metadata for the job
+    Returns:
+        A decorator function that creates a job and associates it with environments
+    """
+    def decorator(func: T) -> T:
+        @functools.wraps(func)
+        async def wrapper(*args: Any, **kwargs: Any) -> Any:
+            created_job = await create_job(
+                name=name,
+                metadata=metadata
+            )
+            # Key the registry on the job's own id. id(wrapper) is identical for
+            # every call of the same decorated function, so overlapping calls
+            # used to overwrite and then delete each other's entry.
+            call_id = f"{func.__module__}.{func.__qualname__}_{created_job.id}"
+            _ACTIVE_JOBS[call_id] = created_job
+            # get_active_job() scans frame locals for this exact name. A real
+            # local variable shows up in frame.f_locals without writing to it.
+            _job_call_id = call_id  # noqa: F841
+            try:
+                return await func(*args, **kwargs)
+            finally:
+                # Always unregister, even when the decorated function raises
+                _ACTIVE_JOBS.pop(call_id, None)
+        return cast(T, wrapper)
+    return decorator
+def get_active_job() -> Job | None:
+    """
+    Find the job registered by an enclosing ``@job``-decorated call, if any.
+    Used internally by gym.make to automatically associate environments with jobs.
+    Returns:
+        The active job or None if no job is active
+    """
+    # Climb from the current frame towards the outermost caller. A frame whose
+    # locals hold a "_job_call_id" that is still registered identifies the job.
+    current = inspect.currentframe()
+    while current:
+        try:
+            return _ACTIVE_JOBS[current.f_locals["_job_call_id"]]
+        except KeyError:
+            # No marker in this frame, or its job was already unregistered
+            current = current.f_back
+    return None

hud/server/__init__.py CHANGED Viewed

@@ -1,5 +1,5 @@
 from __future__ import annotations
-from .requests import RequestError, make_request
+from .requests import RequestError, make_request, make_request_sync
-__all__ = ["RequestError", "make_request"]
+__all__ = ["RequestError", "make_request", "make_request_sync"]

hud/server/requests.py CHANGED Viewed

@@ -6,6 +6,7 @@ from __future__ import annotations
 import asyncio
 import logging
+import time
 from typing import Any
 import httpx
@@ -191,3 +192,89 @@ async def make_request(
         except Exception as e:
             raise RequestError(f"Unexpected error: {e!s}") from None
     raise RequestError(f"Request failed after {max_retries} retries with unknown error")
+def make_request_sync(
+    method: str,
+    url: str,
+    json: Any | None = None,
+    api_key: str | None = None,
+    max_retries: int = 4,
+    retry_delay: float = 2.0,
+) -> dict[str, Any]:
+    """
+    Make a synchronous HTTP request to the HUD API.
+    Responses with status 502, 503 or 504 and httpx network errors are retried
+    with exponential backoff (retry_delay, 2 * retry_delay, 4 * retry_delay, ...).
+    Up to max_retries + 1 attempts are made in total; other HTTP error statuses
+    fail immediately.
+    Args:
+        method: HTTP method (GET, POST, etc.)
+        url: Full URL for the request
+        json: Optional JSON serializable data
+        api_key: API key for authentication
+        max_retries: Maximum number of retries
+        retry_delay: Base delay in seconds before the first retry; doubled for
+            each further retry
+    Returns:
+        dict: JSON response from the server
+    Raises:
+        RequestError: If API key is missing or request fails
+    """
+    if not api_key:
+        raise RequestError("API key is required but not provided")
+    headers = {"Authorization": f"Bearer {api_key}"}
+    retry_status_codes = [502, 503, 504]
+    attempt = 0
+    # attempt is incremented at the top of the body, so attempts are numbered
+    # 1 .. max_retries + 1
+    while attempt <= max_retries:
+        attempt += 1
+        try:
+            # A fresh client is created (and closed) for every attempt
+            with httpx.Client(
+                timeout=600.0, # Long running requests can take up to 10 minutes
+                limits=httpx.Limits(
+                    max_connections=1000,
+                    max_keepalive_connections=1000,
+                    keepalive_expiry=10.0,
+                ),
+            ) as client:
+                response = client.request(
+                    method=method, url=url, json=json, headers=headers
+                )
+            # Check if we got a retriable status code
+            if response.status_code in retry_status_codes and attempt <= max_retries:
+                retry_time = retry_delay * (2 ** (attempt - 1))  # Exponential backoff
+                logger.warning(
+                    "Received status %d from %s, retrying in %.2f seconds (attempt %d/%d)",
+                    response.status_code,
+                    url,
+                    retry_time,
+                    attempt,
+                    max_retries,
+                )
+                time.sleep(retry_time)
+                continue
+            # Non-retriable statuses, or a retriable status on the final
+            # attempt, surface here as httpx.HTTPStatusError
+            response.raise_for_status()
+            result = response.json()
+            return result
+        except httpx.HTTPStatusError as e:
+            raise RequestError.from_http_error(e) from None
+        except httpx.RequestError as e:
+            # Network-level failure: retry with the same backoff schedule
+            if attempt <= max_retries:
+                retry_time = retry_delay * (2 ** (attempt - 1))
+                logger.warning(
+                    "Network error %s from %s, retrying in %.2f seconds (attempt %d/%d)",
+                    str(e),
+                    url,
+                    retry_time,
+                    attempt,
+                    max_retries,
+                )
+                time.sleep(retry_time)
+                continue
+            else:
+                raise RequestError(f"Network error: {e!s}") from None
+        except Exception as e:
+            raise RequestError(f"Unexpected error: {e!s}") from None
+    # Only reached when the loop body never runs (negative max_retries)
+    raise RequestError(f"Request failed after {max_retries} retries with unknown error")

hud/settings.py CHANGED Viewed

@@ -15,7 +15,7 @@ class Settings(BaseSettings):
     model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="allow")
     base_url: str = Field(
-        default="https://orchestrator.hud.live/hud-gym/api/v1",
+        default="https://orcstaging.hud.so/hud-gym/api",
         description="Base URL for the HUD API",
         validation_alias="base_url",
     )
@@ -25,7 +25,18 @@ class Settings(BaseSettings):
         description="API key for authentication with the HUD API",
         validation_alias="HUD_API_KEY",
     )
+    anthropic_api_key: str | None = Field(
+        default=None,
+        description="API key for Anthropic models",
+        validation_alias="ANTHROPIC_API_KEY",
+    )
+    openai_api_key: str | None = Field(
+        default=None,
+        description="API key for OpenAI models",
+        validation_alias="OPENAI_API_KEY",
+    )
 # Create a singleton instance
 settings = Settings()

hud-python 0.1.5__py3-none-any.whl → 0.2.1__py3-none-any.whl

Potentially problematic release.

hud-python 0.1.5__py3-none-any.whl → 0.2.1__py3-none-any.whl