PyPI - hud-python - Versions diffs - 0.2.2__tar.gz → 0.2.3__tar.gz - Mend

hud-python 0.2.2tar.gz → 0.2.3tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (125) hide show

{hud_python-0.2.2 → hud_python-0.2.3}/.github/workflows/ci.yml RENAMED Viewed

@@ -4,7 +4,7 @@ on:
   push:
     branches: [ "main" ]
   pull_request:
-    branches: [ "main" ]
+    branches: [ "*" ]
 jobs:
   test:
@@ -24,7 +24,7 @@ jobs:
         run: uv python install ${{ matrix.python-version }}
       - name: Run tests
-        run: uv run --python ${{ matrix.python-version }} --with=".[dev]" pytest
+        run: uv run --python ${{ matrix.python-version }} --with=".[dev]" pytest --rootdir=hud --cov --cov-report=''
   lint-ruff:
     runs-on: ubuntu-latest
@@ -35,7 +35,7 @@ jobs:
       - name: Run ruff
         run: |
-          uv run --with=".[dev]" ruff format .
+          uv run --with=".[dev]" ruff format . --check
           uv run --with=".[dev]" ruff check .
   lint-pyright:

{hud_python-0.2.2 → hud_python-0.2.3}/.gitignore RENAMED Viewed

@@ -25,3 +25,5 @@ uv.lock
 /*.ipynb
 test.json
 TODO.md
+.coverage

{hud_python-0.2.2 → hud_python-0.2.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hud-python
-Version: 0.2.2
+Version: 0.2.3
 Summary: SDK for the HUD evaluation platform.
 Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
 Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
@@ -59,8 +59,11 @@ Requires-Dist: jupyter-client; extra == 'dev'
 Requires-Dist: jupyter-core; extra == 'dev'
 Requires-Dist: openai; extra == 'dev'
 Requires-Dist: pyright==1.1.364; extra == 'dev'
+Requires-Dist: pytest-asyncio; extra == 'dev'
+Requires-Dist: pytest-cov; extra == 'dev'
+Requires-Dist: pytest-mock; extra == 'dev'
 Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
-Requires-Dist: ruff==0.9.8; extra == 'dev'
+Requires-Dist: ruff==0.11.8; extra == 'dev'
 Description-Content-Type: text/markdown
 # HUD
@@ -94,17 +97,17 @@ pip install hud-python
 ### Simple Browser Example with Claude Computer Use
-> This example uses the `@job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on the your [HUD Jobs page](https://app.hud.so/jobs).
+> This example uses the `@register_job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
 Make sure you have defined your `ANTHROPIC_API_KEY` in environment variables to run Claude.
 ```python
 import asyncio
-from hud import gym, job
+from hud import gym, register_job
 from hud.task import Task
 from hud.agent import ClaudeAgent
-@job("test-run")
+@register_job("test-run")
 async def main():
     task = Task(
         prompt="Insert the text 'capybara' into the search bar",
@@ -192,7 +195,7 @@ If you use this SDK in your research, please cite it as follows:
 ```bibtex
 @software{hud2025agentevalplatform,
-  author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Max Muoto and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
+  author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
   title = {{HUD: An Evaluation Platform for Agents}},
   date = {2025-04},
   url = {https://github.com/hud-evals/hud-sdk},

{hud_python-0.2.2 → hud_python-0.2.3}/README.md RENAMED Viewed

@@ -29,17 +29,17 @@ pip install hud-python
 ### Simple Browser Example with Claude Computer Use
-> This example uses the `@job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on the your [HUD Jobs page](https://app.hud.so/jobs).
+> This example uses the `@register_job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
 Make sure you have defined your `ANTHROPIC_API_KEY` in environment variables to run Claude.
 ```python
 import asyncio
-from hud import gym, job
+from hud import gym, register_job
 from hud.task import Task
 from hud.agent import ClaudeAgent
-@job("test-run")
+@register_job("test-run")
 async def main():
     task = Task(
         prompt="Insert the text 'capybara' into the search bar",
@@ -127,7 +127,7 @@ If you use this SDK in your research, please cite it as follows:
 ```bibtex
 @software{hud2025agentevalplatform,
-  author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Max Muoto and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
+  author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
   title = {{HUD: An Evaluation Platform for Agents}},
   date = {2025-04},
   url = {https://github.com/hud-evals/hud-sdk},

{hud_python-0.2.2 → hud_python-0.2.3}/docs/advanced/environment-control.mdx RENAMED Viewed

@@ -12,11 +12,11 @@ While the standard `step`, `evaluate`, and `close` methods cover most interactio
 The `env._invoke_all()` method (and its underlying `client.invoke()`) is the core mechanism for calling specific functions *within* the environment's controller script.
 ```python
-async def _invoke_all(self, configs: HudStyleConfigs) -> list[Any]: ...
+async def _invoke_all(self, configs: FunctionConfigs) -> list[Any]: ...
 ```
 *   **Purpose:** Execute custom functions defined in your environment controller (the Python code running inside the Docker container or remote instance). This is how `setup` and `evaluate` configurations in a `Task` are ultimately executed.
-*   **Usage:** You provide a configuration (string, tuple, dict, or list) matching the `HudStyleConfigs` format. The SDK sends this to the environment controller, which runs the specified function(s) with the given arguments.
+*   **Usage:** You provide a configuration (string, tuple, dict, or list) matching the `FunctionConfigs` format. The SDK sends this to the environment controller, which runs the specified function(s) with the given arguments.
 *   **When to Use:**
     *   Triggering custom evaluation logic not suitable for the standard `evaluate` attribute.
     *   Running specific diagnostic or state-setting functions within your custom environment controller during development or debugging.
@@ -71,7 +71,7 @@ print("Exit Code:", result['exit_code'])
 ## `_setup`
 ```python
-async def _setup(self, config: HudStyleConfigs | None = None) -> None: ...
+async def _setup(self, config: FunctionConfigs | None = None) -> None: ...
 ```
 *   **Purpose:** Executes the setup configuration for the environment.

{hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/env.mdx RENAMED Viewed

@@ -25,11 +25,11 @@ class Environment(pydantic.BaseModel):
     ) -> tuple[Observation, float, bool, dict[str, Any]]: ...
     async def evaluate(
-        self, config: HudStyleConfigs | None = None
+        self, config: FunctionConfigs | None = None
     ) -> Any: ...
     async def reset(
-        self, configs: HudStyleConfigs | None = None
+        self, configs: FunctionConfigs | None = None
     ) -> tuple[Observation, dict[str, Any]]: ...
     async def get_urls(self) -> dict[str, Any]: ...
@@ -37,8 +37,8 @@ class Environment(pydantic.BaseModel):
     async def close(self) -> None: ...
     # Internal/Advanced Methods
-    # async def _setup(self, config: HudStyleConfigs | None = None) -> None: ...
-    # async def _invoke_all(self, configs: HudStyleConfigs) -> list[Any]: ...
+    # async def _setup(self, config: FunctionConfigs | None = None) -> None: ...
+    # async def _invoke_all(self, configs: FunctionConfigs) -> list[Any]: ...
 ```
 Represents a running instance (browser, OS) where an [Agent](/concepts/agent) interacts. Environments are typically created using `hud.gym.make()` rather than direct construction.
@@ -58,11 +58,11 @@ Represents a running instance (browser, OS) where an [Agent](/concepts/agent) in
     *   **Parameters:**
         *   `actions`: List of [CLA](/concepts/adapter) actions, or `None` to get initial observation.
     *   **Returns:** `(Observation, reward, terminated, info)` tuple. `reward` is typically 0 unless overridden by custom logic. `terminated` is typically `False`.
-*   **`evaluate(self, config: HudStyleConfigs | None = None)`:** Runs the evaluation logic defined in the [Task](/concepts/task) (or the provided `config`).
+*   **`evaluate(self, config: FunctionConfigs | None = None)`:** Runs the evaluation logic defined in the [Task](/concepts/task) (or the provided `config`).
     *   **Parameters:**
         *   `config`: Optional override for evaluation logic using [Configuration Styles](/concepts/task#configuration-styles).
     *   **Returns:** The result from the evaluation function(s).
-*   **`reset(self, configs: HudStyleConfigs | None = None)`:** Resets the environment state, usually running setup logic.
+*   **`reset(self, configs: FunctionConfigs | None = None)`:** Resets the environment state, usually running setup logic.
     *   **Parameters:**
         *   `configs`: Optional override for setup logic.
     *   **Returns:** `(Observation, info)` tuple after resetting. *(Note: `gym.make(task)` handles initial setup; direct `reset` is less common).*

{hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/gym.mdx RENAMED Viewed

@@ -20,7 +20,7 @@ async def make(
 Creates and initializes an [Environment](/concepts/environment) instance based on a specification.
-This function handles selecting the correct client (local docker, remote docker, remote direct) based on the `env_src`, automatically linking to an active [Job](/concepts/job) (from `@job` decorator or the `job` parameter), and running the initial [Task](/concepts/task) setup if `env_src` is a `Task`.
+This function handles selecting the correct client (local docker, remote docker, remote direct) based on the `env_src`, automatically linking to an active [Job](/concepts/job) (from `@register_job` decorator or the `job` parameter), and running the initial [Task](/concepts/task) setup if `env_src` is a `Task`.
 **Parameters:**
@@ -28,7 +28,7 @@ This function handles selecting the correct client (local docker, remote docker,
     *   If a `str` (Gym ID like `"hud-browser"`, `"OSWorld-Ubuntu"`), creates a standard remote environment.
     *   If a `CustomGym` object, creates a custom environment based on its definition (local or remote docker).
     *   If a `Task` object, uses the `task.gym` attribute to determine the environment type and automatically runs `task.setup` after creation.
-*   **`job` (`Job` | None, optional):** A specific [Job](/concepts/job) object to associate this environment run with. If `None`, it attempts to find an active job created by the `@job` decorator.
+*   **`job` (`Job` | None, optional):** A specific [Job](/concepts/job) object to associate this environment run with. If `None`, it attempts to find an active job created by the `@register_job` decorator.
 *   **`metadata` (dict[str, Any] | None, optional):** Additional metadata to attach to the environment instance and its resulting trajectory.
 **Returns:**

{hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/job.mdx RENAMED Viewed

@@ -3,13 +3,13 @@ title: 'hud.job'
 description: 'API reference for Jobs and related functions/decorators'
 ---
-The `hud.job` module provides the `@job` decorator, functions to manage Jobs (`create_job`, `load_job`), and the `Job` class itself.
+The `hud.job` module provides the `@register_job` decorator, functions to manage Jobs (`create_job`, `load_job`), and the `Job` class itself.
 See the [Job Concepts](/concepts/job) page for explanations and usage examples.
 # Decorators
-## @job
+## @register_job
 ```python
 def job(
@@ -92,7 +92,7 @@ class Job(pydantic.BaseModel):
     ) -> list[Trajectory]: ...
 ```
-Represents a Job, typically obtained via `@job`, `create_job`, or `load_job`. Primarily used to access associated trajectories.
+Represents a Job, typically obtained via `@register_job`, `create_job`, or `load_job`. Primarily used to access associated trajectories.
 **Attributes:**

{hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/task.mdx RENAMED Viewed

@@ -13,8 +13,8 @@ The `hud.task` module provides the `Task` class for defining evaluation scenario
 class Task(pydantic.BaseModel):
     id: str | None = None
     prompt: str
-    setup: HudStyleConfigs | None = None
-    evaluate: HudStyleConfigs | None = None
+    setup: FunctionConfigs | None = None
+    evaluate: FunctionConfigs | None = None
     gym: Gym | None = None
     target: str | list[str] | None = None # Inspect compatibility
     choices: list[str] | None = None      # Inspect compatibility
@@ -33,8 +33,8 @@ See the [Tasks and TaskSets Concepts](/concepts/task) page for detailed explanat
 *   **`id` (str | None):** Optional unique identifier, often assigned when loaded from the HUD platform.
 *   **`prompt` (str):** The main instruction or goal for the agent.
-*   **`setup` (`HudStyleConfigs` | None):** Configuration for setup actions executed before the agent starts. See [Configuration Styles](/concepts/task#configuration-styles).
-*   **`evaluate` (`HudStyleConfigs` | None):** Configuration defining the evaluation logic executed by `env.evaluate()`. See [Configuration Styles](/concepts/task#configuration-styles).
+*   **`setup` (`FunctionConfigs` | None):** Configuration for setup actions executed before the agent starts. See [Configuration Styles](/concepts/task#configuration-styles).
+*   **`evaluate` (`FunctionConfigs` | None):** Configuration defining the evaluation logic executed by `env.evaluate()`. See [Configuration Styles](/concepts/task#configuration-styles).
 *   **`gym` (`Gym` | None):** Specifies the required environment type (e.g., `"hud-browser"`, `CustomGym` object). See `hud.types`.
 *   **`target` (str | list[str] | None):** Ideal target output (primarily for compatibility with `inspect-ai`).
 *   **`choices` (list[str] | None):** Multiple choice options (primarily for compatibility with `inspect-ai`).

{hud_python-0.2.2 → hud_python-0.2.3}/docs/concepts/environment.mdx RENAMED Viewed

@@ -46,7 +46,7 @@ env_os = await gym.make("OSWorld-Ubuntu")
 # await env_os.close()
 ```
-Environments created this way won't have a default `Task` associated unless you explicitly reset them with one later using `env.reset()`. The `gym.make()` function also automatically links the environment to an active [Job](/concepts/job) if one was defined using the `@job` decorator.
+Environments created this way won't have a default `Task` associated unless you explicitly reset them with one later using `env.reset()`. The `gym.make()` function also automatically links the environment to an active [Job](/concepts/job) if one was defined using the `@register_job` decorator.
 ## Available Environment Types
@@ -92,10 +92,10 @@ for _ in range(10):
 ## Key Methods
 *   **`env.step(actions: list[CLA] | None = None)`**: Executes actions (or gets initial state). Returns `(Observation, reward, terminated, info)`.
-*   **`env.evaluate(config: HudStyleConfigs | None = None)`**: Runs evaluation logic defined in the [Task](/concepts/task) (or the provided `config`). Returns evaluation result.
+*   **`env.evaluate(config: FunctionConfigs | None = None)`**: Runs evaluation logic defined in the [Task](/concepts/task) (or the provided `config`). Returns evaluation result.
 *   **`env.close()`**: Shuts down the environment. Saves the [Trajectory](/concepts/trajectory) if linked to a [Job](/concepts/job).
 *   **`env.get_urls()`**: Returns URLs (`url`, `live_url`) for accessing/viewing the environment.
-*   **`env.reset(configs: HudStyleConfigs | None = None)`**: Resets state, often running setup steps. *Mostly used internally or for environments created without an initial Task.*
+*   **`env.reset(configs: FunctionConfigs | None = None)`**: Resets state, often running setup steps. *Mostly used internally or for environments created without an initial Task.*
 *   **`env._setup(...)` / `env._invoke_all(...)`**: Internal methods for running setup/evaluate/custom configurations defined in a [Task](/concepts/task).
 ## Observations
@@ -110,5 +110,5 @@ The `Observation` object returned by `env.step()` contains:
 *   [Task](/concepts/task): Defines the environment type (`gym`), `setup`, and `evaluate` logic.
 *   [Agent](/concepts/agent): Interacts with the Environment via the `step` and `predict` methods.
 *   [Adapter](/concepts/adapter): Ensures actions passed to `step` are in the correct `CLA` format.
-*   [Job](/concepts/job): Groups environment runs; linking happens via `@job` or `gym.make(job=...)`.
+*   [Job](/concepts/job): Groups environment runs; linking happens via `@register_job` or `gym.make(job=...)`.
 *   [Trajectory](/concepts/trajectory): The recording generated when a job-linked environment is closed.

{hud_python-0.2.2 → hud_python-0.2.3}/docs/concepts/job.mdx RENAMED Viewed

@@ -18,16 +18,16 @@ Jobs help organize evaluation data, useful for:
 ## Creating Jobs
-### 1. The `@job` Decorator (Recommended)
+### 1. The `@register_job` Decorator (Recommended)
 Decorate an `async` function. A new Job is created per function call, and any environments created within using `hud.gym.make()` are automatically linked.
 ```python
-from hud import gym, job
+from hud import gym, register_job
 from hud.task import Task
 from hud.agent import OperatorAgent # Example agent
-@job(name="my-evaluation-run", metadata={"agent_version": "1.1"})
+@register_job(name="my-evaluation-run", metadata={"agent_version": "1.1"})
 async def run_evaluation():
     task = Task(prompt="Example", gym="hud-browser")
     env = await gym.make(task) # Linked to "my-evaluation-run" job
@@ -89,7 +89,7 @@ async def analyze_job(job_id: str):
 ## Best Practices
-*   Use `@job` for most scripts.
+*   Use `@register_job` for most scripts.
 *   Use descriptive names and metadata.
 *   Create separate jobs for distinct experiments.

{hud_python-0.2.2 → hud_python-0.2.3}/docs/concepts/task.mdx RENAMED Viewed

@@ -15,8 +15,8 @@ A `Task` object provides the configuration for a specific scenario.
 *   **`prompt` (str):** The primary instruction given to the agent.
 *   **`gym` (str | `CustomGym` | None):** Specifies the type of [Environment](/concepts/environment) needed. Used by `hud.gym.make()`.
-*   **`setup` (`HudStyleConfigs` | None):** Defines actions executed *before* the agent starts. See [Setup Configuration](#setup-configuration).
-*   **`evaluate` (`HudStyleConfigs` | None):** Defines how to check if the agent succeeded *after* interaction. See [Evaluation Configuration](#evaluation-configuration).
+*   **`setup` (`FunctionConfigs` | None):** Defines actions executed *before* the agent starts. See [Setup Configuration](#setup-configuration).
+*   **`evaluate` (`FunctionConfigs` | None):** Defines how to check if the agent succeeded *after* interaction. See [Evaluation Configuration](#evaluation-configuration).
 *   **`id` (str | None):** Optional identifier.
 *   **`metadata` (dict | None):** Optional dictionary for extra information.
 *   **`config` (dict | None):** Optional dictionary, primarily for remote execution.
@@ -41,7 +41,7 @@ task = Task(
 ### <a name="configuration-styles"></a>Configuration Styles (`setup` and `evaluate`)
-Both `setup` and `evaluate` accept configurations defining function calls within the environment's controller, using flexible formats (`HudStyleConfigs`):
+Both `setup` and `evaluate` accept configurations defining function calls within the environment's controller, using flexible formats (`FunctionConfigs`):
 1.  **String:** `"browser.maximize"`
 2.  **Tuple:** `("goto", "https://google.com")`

{hud_python-0.2.2 → hud_python-0.2.3}/docs/quickstart.mdx RENAMED Viewed

@@ -43,13 +43,13 @@ This example uses the `OperatorAgent` to interact with a browser environment. It
 ```python
 import asyncio
 import os
-from hud import gym, job                  # Import gym for environments and job decorator
+from hud import gym, register_job                  # Import gym for environments and job decorator
 from hud.task import Task                 # Import Task to define the goal
 from hud.agent import OperatorAgent       # Import the agent
 # hud.settings automatically loads keys from .env or environment variables
 # Decorator to group this run under a job named "quickstart-run"
-@job("quickstart-run")
+@register_job("quickstart-run")
 async def main():
     # 1. Define a Task: What should the agent do?
     task = Task(
@@ -111,7 +111,7 @@ if __name__ == "__main__":
     *   `agent.predict(obs)` gets the next action(s) from the agent.
     *   `env.step(actions)` executes the actions and gets the new observation.
 5.  **Evaluation & Close:** `env.evaluate()` checks if the task succeeded based on the `evaluate` definition. `env.close()` shuts down the environment.
-6.  **`@job` Decorator:** Wrapping `main` with `@job("quickstart-run")` automatically creates a Job. When `env.close()` is called, the recorded interactions (trajectory) are associated with this Job. You can view the job and its trajectory video on the [HUD Jobs page](https://app.hud.so/jobs).
+6.  **`@register_job` Decorator:** Wrapping `main` with `@register_job("quickstart-run")` automatically creates a Job. When `env.close()` is called, the recorded interactions (trajectory) are associated with this Job. You can view the job and its trajectory video on the [HUD Jobs page](https://app.hud.so/jobs).
 ## Next Steps

{hud_python-0.2.2 → hud_python-0.2.3}/docs/running-your-agent.mdx RENAMED Viewed

@@ -29,7 +29,7 @@ This is the most straightforward approach if your agent logic can directly gener
 ```python
 import asyncio
-from hud import gym, job
+from hud import gym, register_job
 from hud.task import Task
 from hud.env import Observation
 # Import specific CLA types you need
@@ -48,7 +48,7 @@ def my_custom_agent_logic(observation: Observation) -> list[CLA]:
     # Ensure the return type is list[CLA]
     return actions
-@job("custom-cla-agent-run")
+@register_job("custom-cla-agent-run")
 async def main():
     task = Task(prompt="Click and type", gym="hud-browser")
     env = await gym.make(task)
@@ -90,7 +90,7 @@ This approach leverages the SDK's structure for a more integrated solution.
 ```python
 import asyncio
 from typing import Any # Placeholder for your raw action type
-from hud import gym, job
+from hud import gym, register_job
 from hud.task import Task
 from hud.env import Observation
 from hud.agent import Agent # Import base class
@@ -136,7 +136,7 @@ class MyAdapter(Adapter):
         raise ValueError(f"Unknown raw action type: {raw_action}")
 # --- Usage ---
-@job("custom-agent-framework-run")
+@register_job("custom-agent-framework-run")
 async def main():
     task = Task(prompt="Use custom agent", gym="hud-browser")
     env = await gym.make(task)
@@ -181,13 +181,13 @@ This approach uses HUD primarily for environment provisioning and lifecycle mana
 ```python
 import asyncio
 import os
-from hud import gym, job
+from hud import gym, register_job
 from hud.task import Task
 from hud.utils import stream # For live view
 # Need external library, e.g., pyppeteer (pip install pyppeteer)
 # import pyppeteer
-@job("external-control-run")
+@register_job("external-control-run")
 async def main():
     task = Task(prompt="Externally controlled task", gym="hud-browser", setup=("goto", "google.com"))
     env = await gym.make(task)

{hud_python-0.2.2 → hud_python-0.2.3}/environments/novnc_ubuntu/src/novnc_ubuntu/__init__.py RENAMED Viewed

@@ -1,4 +1,5 @@
 """Initialize the local-qa environment package."""
 from __future__ import annotations
 from .step import step

hud-python 0.2.2__tar.gz → 0.2.3__tar.gz

hud-python 0.2.2tar.gz → 0.2.3tar.gz