hud-python 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff shows the content of these publicly available package versions as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- hud/__init__.py +3 -2
- hud/adapters/__init__.py +2 -1
- hud/adapters/claude/adapter.py +15 -2
- hud/adapters/common/types.py +7 -3
- hud/adapters/operator/adapter.py +10 -6
- hud/agent/__init__.py +2 -1
- hud/agent/claude.py +22 -2
- hud/agent/langchain.py +198 -0
- hud/agent/operator.py +35 -17
- hud/env/docker_client.py +1 -1
- hud/env/environment.py +182 -9
- hud/env/local_docker_client.py +3 -1
- hud/env/remote_client.py +4 -0
- hud/gym.py +3 -3
- hud/job.py +420 -12
- hud/task.py +41 -30
- hud/taskset.py +8 -0
- hud/types.py +5 -3
- hud/utils/common.py +31 -1
- hud/utils/config.py +2 -93
- hud/utils/progress.py +136 -0
- {hud_python-0.2.0.dist-info → hud_python-0.2.2.dist-info}/METADATA +52 -39
- hud_python-0.2.2.dist-info/RECORD +46 -0
- hud_python-0.2.0.dist-info/RECORD +0 -44
- {hud_python-0.2.0.dist-info → hud_python-0.2.2.dist-info}/WHEEL +0 -0
- {hud_python-0.2.0.dist-info → hud_python-0.2.2.dist-info}/licenses/LICENSE +0 -0
hud/job.py
CHANGED
@@ -1,17 +1,27 @@
 from __future__ import annotations
 
+import asyncio
 import datetime
 import functools
 import inspect
 import logging
-
-from
+import sys
+from collections.abc import Callable, Coroutine
+from typing import TYPE_CHECKING, Any, TypeVar, cast
 
-from pydantic import BaseModel, TypeAdapter
+from pydantic import BaseModel, PrivateAttr, TypeAdapter
 
+from hud import gym
 from hud.server import make_request
 from hud.settings import settings
+from hud.task import Task
+from hud.taskset import TaskSet
 from hud.trajectory import Trajectory
+from hud.utils.progress import StepProgressTracker
+
+if TYPE_CHECKING:
+    from hud.adapters.common import Adapter
+    from hud.agent.base import Agent
 
 logger = logging.getLogger("hud.job")
 
@@ -25,7 +35,7 @@ class Job(BaseModel):
     """
    A job represents a collection of related trajectories.
    It holds metadata and provides methods to interact with job data.
-    Instances should typically be obtained via `create_job` or `
+    Instances should typically be obtained via `create_job`, `load_job`, or the new `run_job`.
    """
 
    id: str
@@ -34,23 +44,85 @@ class Job(BaseModel):
     created_at: datetime.datetime
     status: str
 
-
+    # Internal cache for trajectories
+    _trajectories: list[Trajectory] | None = PrivateAttr(default=None)
+    # Store execution errors for debugging
+    errors: list[dict[str, Any]] = []
+
+    async def load_trajectories(
+        self, *, api_key: str | None = None, force_reload: bool = False
+    ) -> list[Trajectory]:
         """
         Loads the trajectories associated with this job.
+        Uses cached results unless force_reload is True.
+
+        Args:
+            api_key: Optional API key.
+            force_reload: If True, fetches trajectories from the API even if cached.
 
         Returns:
             List[Trajectory]: The trajectories in the job
         """
+        if self._trajectories is not None and not force_reload:
+            logger.debug("Returning cached trajectories for Job %s", self.id)
+            return self._trajectories
+
+        logger.debug("Fetching trajectories for Job %s from API...", self.id)
         api_key = api_key or settings.api_key
 
-
-
-
-
-
+        try:
+            data = await make_request(
+                method="GET",
+                url=f"{settings.base_url}/v2/jobs/{self.id}/trajectories",
+                api_key=api_key,
+            )
+            self._trajectories = TypeAdapter(list[Trajectory]).validate_python(data)
+            logger.debug("Loaded %d trajectories for Job %s", len(self._trajectories), self.id)
+            return self._trajectories
+        except Exception as e:
+            logger.exception("Failed to load trajectories for Job %s: %s", self.id, e)
+            self._trajectories = None  # Ensure cache is cleared on error
+            return []  # Return empty list on error
+
+    async def get_analytics(self, *, force_reload: bool = False) -> dict[str, Any]:
+        """
+        Calculates and returns analytics for the job based on its trajectories.
+
+        Args:
+            force_reload: If True, re-fetches trajectories before calculating.
+
+        Returns:
+            Dictionary containing analytics (e.g., task_count, avg_reward).
+        """
+        trajectories = await self.load_trajectories(force_reload=force_reload)
 
-
+        task_count = len(trajectories)
+        if task_count == 0:
+            return {"task_count": 0, "avg_reward": None, "success_rate": None}  # Or other default
+
+        total_reward = 0
+        successful_tasks = 0
+        valid_rewards = 0
 
+        for traj in trajectories:
+            # Example: Assume reward is numeric and success is reward >= 1.0
+            # Adjust based on actual trajectory data structure and evaluation logic
+            if isinstance(traj.reward, int | float):
+                total_reward += traj.reward
+                valid_rewards += 1
+                if traj.reward >= 1.0:
+                    successful_tasks += 1
+            # Add more complex logic here if needed based on traj.evaluation_result or metadata
+
+        avg_reward = (total_reward / valid_rewards) if valid_rewards > 0 else None
+        success_rate = (successful_tasks / task_count) * 100 if task_count > 0 else None
+
+        return {
+            "task_count": task_count,
+            "avg_reward": avg_reward,
+            "success_rate": success_rate,
+            # Add other relevant stats here
+        }
 
 async def create_job(name: str, gym_id: str | None = None,
                      evalset_id: str | None = None,
@@ -84,7 +156,9 @@ async def create_job(name: str, gym_id: str | None = None,
     # or at least the necessary fields (id, name, metadata, created_at, status)
     # If not, we might need to make a subsequent GET request
     job_data = data  # Adjust if the API response structure is different
-
+
+    logger.info("[HUD] View job at https://app.hud.so/jobs/%s.", job_data["id"])
+
     return Job(
         id=job_data["id"],
         name=job_data["name"],
@@ -183,3 +257,337 @@ def get_active_job() -> Job | None:
         frame = frame.f_back
 
     return None
+
+# --- Moved helper functions from runner.py ---
+
+async def _execute_task(
+    agent_cls: type[Agent],
+    adapter_cls: type[Adapter] | None,
+    agent_kwargs: dict[str, Any] | None,
+    adapter_kwargs: dict[str, Any] | None,
+    task: Task,
+    job_name: str,
+    task_id: str,
+    max_steps_per_task: int,
+    job: Job,
+    tracker: StepProgressTracker | None = None,
+    # Use semaphores instead of rate limiter
+    env_creation_semaphore: asyncio.Semaphore | None = None,
+    agent_predict_semaphore: asyncio.Semaphore | None = None,
+) -> None:
+    """Helper function to instantiate/run/evaluate a single task, with concurrency limits via
+    semaphores."""
+    if tracker:
+        tracker.start_task(task_id)
+    env = None
+    agent_instance: Agent | None = None
+    status = "error"
+    error_msg = "Initialization failed"
+    try:
+        adapter_instance = None
+        if adapter_cls:
+            adapter_instance = adapter_cls(**(adapter_kwargs or {}))
+        agent_instance = agent_cls(adapter=adapter_instance, **(agent_kwargs or {}))
+        if agent_instance is None:
+            raise RuntimeError("Agent could not be instantiated")
+
+        # Environment creation with semaphore
+        if env_creation_semaphore:
+            async with env_creation_semaphore:
+                env = await gym.make(task, job=job)
+        else:
+            env = await gym.make(task, job=job)
+
+        obs_tuple = await env.reset()
+        if obs_tuple is None:
+            raise ValueError(f"env.reset() returned None for task {task_id}")
+        obs, _ = obs_tuple
+
+        step_error = None
+        for step in range(max_steps_per_task):
+            action, done = (None, False)
+            try:
+                # Agent prediction with semaphore
+                if agent_predict_semaphore:
+                    async with agent_predict_semaphore:
+                        action, done = await agent_instance.predict(obs)
+                else:
+                    action, done = await agent_instance.predict(obs)
+
+                if tracker:
+                    tracker.increment_step(task_id)
+
+                if action is None and not done:
+                    done = True
+
+                step_result = await env.step(action)
+                if step_result is None:
+                    terminated = True
+                else:
+                    obs, _, terminated, _ = step_result
+                if terminated or done:
+                    break
+
+            except Exception as agent_step_err:
+                logger.exception("[Job: %s/%s, Task: %s] Step %d Error: %s", job.name, job.id,
+                                 task_id, step + 1, agent_step_err)
+                step_error = f"Error at step {step + 1}: {agent_step_err}"
+                # Store step error in job
+                job.errors.append({
+                    "task_id": task_id,
+                    "type": "step_error",
+                    "step": step + 1,
+                    "error": str(agent_step_err),
+                    "timestamp": datetime.datetime.now().isoformat()
+                })
+                break
+        else:
+            logger.warning("[Job: %s/%s, Task: %s] Max steps reached.", job.name, job.id, task_id)
+
+        # --- Evaluate Task ---
+        evaluation_result = None
+        if step_error:
+            status = "error"
+            error_msg = step_error
+        else:
+            try:
+                evaluation_result = await env.evaluate()
+                status = "completed"
+                error_msg = None
+            except Exception as eval_err:
+                logger.exception("[Job: %s/%s, Task: %s] Evaluation Error: %s", job.name,
+                                 job.id, task_id, eval_err)
+                status = "error"
+                error_msg = f"Evaluation failed: {eval_err}"
+                # Store evaluation error in job
+                job.errors.append({
+                    "task_id": task_id,
+                    "type": "evaluation_error",
+                    "error": str(eval_err),
+                    "timestamp": datetime.datetime.now().isoformat()
+                })
+
+    except Exception as e:
+        logger.exception("[Job: %s/%s, Task: %s] Setup/Run Error: %s", job.name, job.id, task_id, e)
+        status = "error"
+        error_msg = str(e)
+        # Store setup/initialization error in job
+        job.errors.append({
+            "task_id": task_id,
+            "type": "setup_error",
+            "error": str(e),
+            "timestamp": datetime.datetime.now().isoformat()
+        })
+
+    finally:
+        if tracker:
+            tracker.finish_task(task_id)
+        if env:
+            try:
+                await env.close()
+            except Exception as close_err:
+                logger.exception("[Job: %s/%s, Task: %s] Close Error: %s", job.name, job.id,
+                                 task_id, close_err)
+                # Store environment close error in job
+                job.errors.append({
+                    "task_id": task_id,
+                    "type": "env_close_error",
+                    "error": str(close_err),
+                    "timestamp": datetime.datetime.now().isoformat()
+                })
+
+        log_suffix = f" Error: {error_msg}" if status == "error" else f" Eval: {evaluation_result}"
+        logger.info("[Job: %s/%s, Task: %s] Finished local execution. Status: %s.%s", job.name,
+                    job.id, task_id, status, log_suffix)
+
+async def _progress_monitor(tracker: StepProgressTracker, interval: float = 1.0) -> None:
+    """Coroutine to periodically display progress using the tracker."""
+    try:
+        while not tracker.is_finished():
+            sys.stderr.write(f"\r{tracker.display()}")
+            sys.stderr.flush()
+            await asyncio.sleep(interval)
+        sys.stderr.write(f"\r{tracker.display()}\n")
+        sys.stderr.flush()
+        logger.debug("Progress monitor finished.")
+    except asyncio.CancelledError:
+        sys.stderr.write("\nProgress monitor cancelled.\n")
+        sys.stderr.flush()
+        logger.debug("Progress monitor cancelled.")
+    except Exception as e:
+        sys.stderr.write(f"\nProgress monitor error: {e}\n")
+        sys.stderr.flush()
+        logger.exception("Progress monitor error: %s", e)
+
+
+# --- New run_job function ---
+
+async def run_job(
+    agent_cls: type[Agent],
+    task_or_taskset: Task | TaskSet,
+    job_name: str,
+    adapter_cls: type[Adapter] | None = None,
+    agent_kwargs: dict[str, Any] | None = None,
+    adapter_kwargs: dict[str, Any] | None = None,
+    max_steps_per_task: int = 20,
+    run_parallel: bool = True,
+    job_metadata: dict[str, Any] | None = None,
+    show_progress: bool = True,
+    # Concurrency control with semaphores
+    max_concurrent_env_creations: int | None = 30,  # Limits env.make calls
+    max_concurrent_agent_predictions: int | None = 30,  # Limits agent.predict calls
+    max_concurrent_tasks: int | None = 30,  # Limits overall task concurrency
+) -> Job:
+    """
+    Creates a Job and executes tasks locally, linking them to the Job.
+    Instantiates agent/adapter per task. Shows step-based progress.
+
+    Controls concurrency in three ways:
+    1. Limits concurrent environment creations
+    2. Limits concurrent agent predictions
+    3. Limits overall concurrent tasks (when run_parallel=True)
+
+    All concurrency controls use semaphores for reliability.
+    Tracks all errors that occur during execution in job.errors.
+
+    Args:
+        agent_cls: Agent class to instantiate.
+        task_or_taskset: Task or TaskSet to run.
+        job_name: Name for the Job.
+        adapter_cls: Optional Adapter class.
+        agent_kwargs: Optional kwargs for agent constructor.
+        adapter_kwargs: Optional kwargs for adapter constructor.
+        max_steps_per_task: Step limit per task.
+        run_parallel: Run TaskSet tasks concurrently if True (limited by max_concurrent_tasks).
+        job_metadata: Metadata for the created Job.
+        show_progress: Display the step-based progress tracker.
+        max_concurrent_env_creations: Max concurrent environment creation calls.
+        max_concurrent_agent_predictions: Max concurrent agent prediction calls.
+        max_concurrent_tasks: Max number of tasks to run actively at the same time.
+
+    Returns:
+        The created Job object with errors stored in job.errors.
+    """
+    tasks_to_run: list[Task] = []
+    created_job: Job | None = None
+
+    # --- Create Job ---
+    try:
+        logger.info("Creating job with name: '%s'", job_name)
+        created_job = await create_job(name=job_name, metadata=job_metadata)
+        logger.info("Created job with ID: %s", created_job.id)
+    except Exception as e:
+        logger.exception("Failed to create job '%s': %s", job_name, e)
+        raise
+
+    # --- Task Setup ---
+    is_taskset = isinstance(task_or_taskset, TaskSet)
+    if is_taskset:
+        tasks_to_run = task_or_taskset.tasks if task_or_taskset.tasks else []
+    elif isinstance(task_or_taskset, Task):
+        tasks_to_run = [task_or_taskset]
+        run_parallel = False
+    else:
+        raise TypeError("task_or_taskset must be either a Task or a TaskSet")
+
+    if not tasks_to_run:
+        logger.warning("Job '%s' (%s): No tasks found to run.", created_job.name, created_job.id)
+        return created_job
+
+    task_ids = [(str(task.id) if task.id else f"task_{i}") for i, task in enumerate(tasks_to_run)]
+    num_tasks = len(tasks_to_run)
+
+    # --- Create semaphores for concurrency control ---
+    env_creation_sema = None
+    if max_concurrent_env_creations and max_concurrent_env_creations > 0:
+        env_creation_sema = asyncio.Semaphore(max_concurrent_env_creations)
+        logger.info("Limiting concurrent environment creations to %d.",
+                    max_concurrent_env_creations)
+
+    agent_predict_sema = None
+    if max_concurrent_agent_predictions and max_concurrent_agent_predictions > 0:
+        agent_predict_sema = asyncio.Semaphore(max_concurrent_agent_predictions)
+        logger.info("Limiting concurrent agent predictions to %d.",
+                    max_concurrent_agent_predictions)
+
+    task_execution_sema = None
+    effective_concurrency = num_tasks  # Default to running all if parallel
+    if run_parallel and max_concurrent_tasks and max_concurrent_tasks > 0:
+        effective_concurrency = min(num_tasks, max_concurrent_tasks)
+        task_execution_sema = asyncio.Semaphore(effective_concurrency)
+        logger.info("Limiting concurrent task executions to %d.", effective_concurrency)
+    elif not run_parallel:
+        effective_concurrency = 1  # Sequential means concurrency of 1
+
+    # --- Instantiate Tracker & Start Monitor ---
+    tracker = None
+    monitor_task = None
+    if show_progress and num_tasks > 0:
+        tracker = StepProgressTracker(total_tasks=num_tasks, max_steps_per_task=max_steps_per_task)
+        monitor_task = asyncio.create_task(_progress_monitor(tracker))
+
+    # --- Execute Tasks ---
+    job_desc_suffix = f" (Job ID: {created_job.id})"
+
+    async def task_wrapper(task_coro: Coroutine, semaphore: asyncio.Semaphore | None) -> None:
+        if semaphore:
+            async with semaphore:
+                await task_coro
+        else:
+            await task_coro
+
+    try:
+        if run_parallel and is_taskset:
+            logger.info("Job '%s'%s: Running %d tasks with concurrency %d.", created_job.name,
+                        job_desc_suffix, num_tasks, effective_concurrency)
+
+            task_coroutines = [
+                _execute_task(
+                    agent_cls=agent_cls, adapter_cls=adapter_cls, agent_kwargs=agent_kwargs,
+                    adapter_kwargs=adapter_kwargs, task=task, job_name=created_job.name,
+                    task_id=task_id,
+                    max_steps_per_task=max_steps_per_task, job=created_job, tracker=tracker,
+                    env_creation_semaphore=env_creation_sema,
+                    agent_predict_semaphore=agent_predict_sema,
+                )
+                for task, task_id in zip(tasks_to_run, task_ids, strict=True)
+            ]
+
+            # Wrap coroutines with semaphore management if limiting concurrency
+            wrapped_tasks = [
+                task_wrapper(coro, task_execution_sema)
+                for i, coro in enumerate(task_coroutines)
+            ]
+
+            # Run all wrapped tasks
+            await asyncio.gather(*wrapped_tasks)
+
+        else:
+            # SEQUENTIAL (or single task)
+            logger.info("Job '%s'%s: Running %d tasks sequentially.", created_job.name,
+                        job_desc_suffix, num_tasks)
+            for i, task in enumerate(tasks_to_run):
+                task_id = task_ids[i]
+                await _execute_task(
+                    agent_cls=agent_cls, adapter_cls=adapter_cls, agent_kwargs=agent_kwargs,
+                    adapter_kwargs=adapter_kwargs, task=task, job_name=created_job.name,
+                    task_id=task_id,
+                    max_steps_per_task=max_steps_per_task, job=created_job, tracker=tracker,
+                    env_creation_semaphore=env_creation_sema,
+                    agent_predict_semaphore=agent_predict_sema,
+                )
+
+    finally:
+        # Ensure monitor task is stopped and awaited cleanly
+        if monitor_task is not None and not monitor_task.done():
+            monitor_task.cancel()
+            try:
+                await monitor_task
+            except asyncio.CancelledError:
+                pass
+            except Exception as e:
+                logger.error("Error awaiting progress monitor task: %s", e)
+
+    logger.info("Job '%s'%s finished local execution phase for %d tasks.", created_job.name,
+                job_desc_suffix, num_tasks)
+    return created_job
hud/task.py
CHANGED
@@ -5,8 +5,7 @@ from typing import TYPE_CHECKING, Any
 from pydantic import BaseModel
 
 from hud.types import CustomGym, Gym
-from hud.utils import HudStyleConfig
-from hud.utils.config import HudStyleConfigs
+from hud.utils.common import HudStyleConfig, HudStyleConfigs
 
 if TYPE_CHECKING:
     from inspect_ai.dataset import Sample
@@ -35,7 +34,7 @@ class Task(BaseModel):
 
     The setup and evaluate configurations can be in several formats:
     - String (function name): "chrome.maximize"
-    -
+    - Tuple (function with args): ("chrome.activate_tab", 5)
    - Dict: {"function": "chrome.navigate", "args": ["https://example.com"]}
    - List of the above: ["chrome.maximize", {"function": "chrome.navigate", "args": ["https://example.com"]}]
 
@@ -68,15 +67,15 @@ class Task(BaseModel):
     @classmethod
     def from_inspect_sample(cls, sample: Sample) -> Task:
         """Create a Task from an Inspect dataset sample.
-
-
+        Automatically detects if a CustomGym (docker) or QA Gym is needed based on sample.sandbox.
+        Configures evaluation using 'response_includes' or 'match_all' based on sample.target.
 
         Args:
             sample: An Inspect dataset Sample object
 
         Returns:
             Task instance
-
+
         The Inspect Sample has these fields:
         - input (str | list[ChatMessage]): The input to be submitted to the model
         - choices (list[str] | None): Optional multiple choice answer list
@@ -87,10 +86,8 @@ class Task(BaseModel):
         - files (dict[str, str] | None): Optional files that go with the sample
         - setup (str | None): Optional setup script to run for sample
         """
-        # Extract the input as prompt
         prompt = sample.input
-        if isinstance(prompt, list):
-            # Convert chat message list to a string representation
+        if isinstance(prompt, list):
             prompt_parts = []
             for message in prompt:
                 role = message.role
@@ -98,36 +95,50 @@ class Task(BaseModel):
                 prompt_parts.append(f"{role.capitalize()}: {content}")
             prompt = "\n\n".join(prompt_parts)
 
-
+        evaluate_config = None
+        if sample.target:
+            if isinstance(sample.target, str):
+                evaluate_config = ("response_includes", [sample.target])
+            elif isinstance(sample.target, list):
+                evaluate_config = ("match_all", sample.target)
+
+        task_gym: Gym | None = None
+        task_setup: HudStyleConfigs | None = None
+
         sandbox = sample.sandbox
         dockerfile = None
+        use_qa_gym = True
+
         if sandbox:
             if isinstance(sandbox, str):
-                if sandbox
-
+                if sandbox == "docker":
+                    dockerfile = UBUNTU_DOCKERFILE
+                    use_qa_gym = False
             elif isinstance(sandbox, tuple) and len(sandbox) == 2:
                 sandbox_type, sandbox_config = sandbox
-                if sandbox_type
-
-
-
-
-
-
-
-
-
+                if sandbox_type == "docker":
+                    dockerfile = sandbox_config
+                    use_qa_gym = False
+
+        if use_qa_gym:
+            task_gym = "qa"
+            task_setup = None
+        else:
+            task_gym = CustomGym(
+                dockerfile=dockerfile or UBUNTU_DOCKERFILE,
+                location="local",
+            )
+            task_setup = [x for x in convert_inspect_setup(sample.setup)] if sample.setup else None
+            # TODO: Handle sample.files for CustomGym case if needed
+
 
         return cls(
-            id=
+            id=None,
             prompt=prompt,
-            setup=
+            setup=task_setup,
             metadata=sample.metadata,
             choices=sample.choices,
-
-            gym=
+            evaluate=evaluate_config,
+            gym=task_gym,
+            # files=sample.files, # TODO: Decide how/if to handle files
         )
-
-    def convert_sdk01(self) -> None:
-        self.setup = [HudStyleConfig(function="reset", args=[{"task_id": self.id}])]
-        self.evaluate = [HudStyleConfig(function="evaluate", args=[])]
hud/taskset.py
CHANGED
@@ -9,6 +9,8 @@ from hud.settings import settings
 from hud.task import Task
 
 if TYPE_CHECKING:
+    from collections.abc import Iterator
+
     from inspect_ai.dataset import Dataset
 
 
@@ -49,6 +51,12 @@ class TaskSet(BaseModel):
         """
         return len(self.tasks)
 
+    def __iter__(self) -> Iterator[Task]:
+        """
+        Returns an iterator over the tasks in the taskset.
+        """
+        return iter(self.tasks)
+
 
 async def load_taskset(taskset_id: str, api_key: str | None = None) -> TaskSet:
     """
hud/types.py
CHANGED
@@ -44,9 +44,6 @@ class CustomGym(BaseModel):
         # Read the Dockerfile content
         self.dockerfile = dockerfile_path.read_text()
 
-# Strings are identifiers for gyms on the HUD server
-Gym = CustomGym | str
-
 class EnvironmentStatus(str, enum.Enum):
     """
     Status of the environment.
@@ -63,3 +60,8 @@ class EnvironmentStatus(str, enum.Enum):
     COMPLETED = "completed"
     ERROR = "error"
 
+# Available HUD gyms
+ServerGym = Literal["qa", "hud-browser", "hud-ubuntu", "OSWorld-Ubuntu"]
+
+# Gyms can be either custom or server-side
+Gym = CustomGym | ServerGym
hud/utils/common.py
CHANGED
@@ -3,16 +3,46 @@ from __future__ import annotations
 import io
 import logging
 import tarfile
-from typing import TYPE_CHECKING, TypedDict
+from typing import TYPE_CHECKING, Any, TypedDict
+
+from pydantic import BaseModel
 
 from hud.server.requests import make_request
 from hud.settings import settings
 
 if TYPE_CHECKING:
+    from collections.abc import Iterator
     from pathlib import Path
 
 logger = logging.getLogger("hud.utils.common")
 
+class HudStyleConfig(BaseModel):
+    function: str  # Format: "x.y.z"
+    args: list[Any]  # Must be json serializable
+
+    id: str | None = None  # Optional id for remote execution
+
+    def __len__(self) -> int:
+        return len(self.args)
+
+    def __getitem__(self, index: int) -> Any:
+        return self.args[index]
+
+    def __iter__(self) -> Iterator[Any]:
+        return iter(self.args)
+
+    def __str__(self) -> str:
+        return f"{self.function}: {', '.join(str(arg) for arg in self.args)}"
+
+# Type alias for the shorthand config, which just converts to function name and args
+ShorthandConfig = tuple[str | dict[str, Any] | list[str] | list[dict[str, Any]], ...]
+
+# Type alias for multiple config formats
+HudStyleConfigs = (
+    ShorthandConfig | HudStyleConfig | list[HudStyleConfig] | list[ShorthandConfig]
+    | dict[str, Any] | str
+)
+
 class ExecuteResult(TypedDict):
     """
     Result of an execute command.