hud-python 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hud_python-0.2.2 → hud_python-0.2.3}/.github/workflows/ci.yml +3 -3
- {hud_python-0.2.2 → hud_python-0.2.3}/.gitignore +2 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/PKG-INFO +9 -6
- {hud_python-0.2.2 → hud_python-0.2.3}/README.md +4 -4
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/advanced/environment-control.mdx +3 -3
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/env.mdx +6 -6
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/gym.mdx +2 -2
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/job.mdx +3 -3
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/task.mdx +4 -4
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/concepts/environment.mdx +4 -4
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/concepts/job.mdx +4 -4
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/concepts/task.mdx +3 -3
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/quickstart.mdx +3 -3
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/running-your-agent.mdx +6 -6
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/novnc_ubuntu/src/novnc_ubuntu/__init__.py +1 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/novnc_ubuntu/src/novnc_ubuntu/pyautogui_rosetta.py +76 -63
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/novnc_ubuntu/src/novnc_ubuntu/step.py +0 -1
- {hud_python-0.2.2 → hud_python-0.2.3}/examples/browser_use.ipynb +5 -12
- {hud_python-0.2.2 → hud_python-0.2.3}/examples/ds_upload.ipynb +39 -36
- hud_python-0.2.3/examples/example.ipynb +86 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/examples/inspect.ipynb +9 -13
- {hud_python-0.2.2 → hud_python-0.2.3}/examples/jobs.ipynb +4 -3
- {hud_python-0.2.2 → hud_python-0.2.3}/examples/local.ipynb +7 -10
- {hud_python-0.2.2 → hud_python-0.2.3}/examples/osworld.ipynb +32 -30
- {hud_python-0.2.2 → hud_python-0.2.3}/examples/tasks.ipynb +12 -15
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/__init__.py +4 -3
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/claude/adapter.py +5 -14
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/common/adapter.py +3 -3
- hud_python-0.2.3/hud/adapters/common/tests/test_adapter.py +277 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/common/types.py +3 -3
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/operator/adapter.py +16 -23
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/agent/__init__.py +8 -1
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/agent/base.py +28 -28
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/agent/claude.py +69 -60
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/agent/langchain.py +32 -26
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/agent/operator.py +75 -67
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/env/__init__.py +5 -5
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/env/client.py +2 -2
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/env/docker_client.py +37 -39
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/env/environment.py +91 -66
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/env/local_docker_client.py +5 -7
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/env/remote_client.py +39 -32
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/env/remote_docker_client.py +13 -3
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/evaluators/__init__.py +2 -3
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/evaluators/base.py +4 -3
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/evaluators/inspect.py +3 -8
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/evaluators/judge.py +34 -58
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/evaluators/match.py +42 -49
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/evaluators/remote.py +13 -26
- hud_python-0.2.3/hud/evaluators/tests/test_inspect.py +12 -0
- hud_python-0.2.3/hud/evaluators/tests/test_judge.py +231 -0
- hud_python-0.2.3/hud/evaluators/tests/test_match.py +115 -0
- hud_python-0.2.3/hud/evaluators/tests/test_remote.py +98 -0
- hud_python-0.2.3/hud/exceptions.py +167 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/gym.py +9 -7
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/job.py +179 -109
- hud_python-0.2.3/hud/py.typed +0 -0
- hud_python-0.2.3/hud/server/__init__.py +5 -0
- hud_python-0.2.3/hud/server/requests.py +242 -0
- hud_python-0.2.3/hud/server/tests/__init__.py +0 -0
- hud_python-0.2.3/hud/server/tests/test_requests.py +275 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/settings.py +3 -2
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/task.py +9 -19
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/taskset.py +44 -11
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/trajectory.py +6 -9
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/types.py +12 -9
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/utils/__init__.py +2 -2
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/utils/common.py +36 -15
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/utils/config.py +45 -30
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/utils/progress.py +34 -21
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/utils/telemetry.py +10 -11
- hud_python-0.2.3/hud/utils/tests/__init__.py +0 -0
- hud_python-0.2.3/hud/utils/tests/test_common.py +52 -0
- hud_python-0.2.3/hud/utils/tests/test_config.py +129 -0
- hud_python-0.2.3/hud/utils/tests/test_progress.py +225 -0
- hud_python-0.2.3/hud/utils/tests/test_telemetry.py +37 -0
- hud_python-0.2.2/tests/test_import.py → hud_python-0.2.3/hud/utils/tests/test_version.py +2 -1
- {hud_python-0.2.2 → hud_python-0.2.3}/pyproject.toml +26 -2
- hud_python-0.2.2/hud/server/__init__.py +0 -5
- hud_python-0.2.2/hud/server/requests.py +0 -280
- {hud_python-0.2.2 → hud_python-0.2.3}/.env.example +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/.github/workflows/release.yml +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/LICENSE +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/MANIFEST.in +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/advanced/cla-details.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/advanced/custom-environments.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/api/reference/adapters.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/adapters.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/taskset.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/trajectory.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/concepts/adapter.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/concepts/agent.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/concepts/trajectory.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/docs.json +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/environments/hud-browser.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/environments/hud-ubuntu.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/environments/qa.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/examples/basic.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/examples/claude-agent.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/examples/custom-agent.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/favicon.png +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/installation.mdx +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/logo/HUD-light-optimized.svg +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/docs/logo/HUD.svg +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/novnc_ubuntu/Dockerfile +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/novnc_ubuntu/pyproject.toml +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/Dockerfile +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/pyproject.toml +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/__init__.py +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/evaluate/__init__.py +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/evaluate/matchers.py +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/info.py +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/setup/__init__.py +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/setup/question.py +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/step.py +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/utils/__init__.py +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/utils/state.py +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/examples/README.md +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/examples/WebVoyager_data.jsonl +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/__init__.py +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/claude/__init__.py +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/common/__init__.py +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3/hud/adapters/common}/tests/__init__.py +0 -0
- {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/operator/__init__.py +0 -0
- /hud_python-0.2.2/hud/py.typed → /hud_python-0.2.3/hud/evaluators/tests/__init__.py +0 -0
|
@@ -4,7 +4,7 @@ on:
|
|
|
4
4
|
push:
|
|
5
5
|
branches: [ "main" ]
|
|
6
6
|
pull_request:
|
|
7
|
-
branches: [ "
|
|
7
|
+
branches: [ "*" ]
|
|
8
8
|
|
|
9
9
|
jobs:
|
|
10
10
|
test:
|
|
@@ -24,7 +24,7 @@ jobs:
|
|
|
24
24
|
run: uv python install ${{ matrix.python-version }}
|
|
25
25
|
|
|
26
26
|
- name: Run tests
|
|
27
|
-
run: uv run --python ${{ matrix.python-version }} --with=".[dev]" pytest
|
|
27
|
+
run: uv run --python ${{ matrix.python-version }} --with=".[dev]" pytest --rootdir=hud --cov --cov-report=''
|
|
28
28
|
|
|
29
29
|
lint-ruff:
|
|
30
30
|
runs-on: ubuntu-latest
|
|
@@ -35,7 +35,7 @@ jobs:
|
|
|
35
35
|
|
|
36
36
|
- name: Run ruff
|
|
37
37
|
run: |
|
|
38
|
-
uv run --with=".[dev]" ruff format .
|
|
38
|
+
uv run --with=".[dev]" ruff format . --check
|
|
39
39
|
uv run --with=".[dev]" ruff check .
|
|
40
40
|
|
|
41
41
|
lint-pyright:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: SDK for the HUD evaluation platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
|
|
@@ -59,8 +59,11 @@ Requires-Dist: jupyter-client; extra == 'dev'
|
|
|
59
59
|
Requires-Dist: jupyter-core; extra == 'dev'
|
|
60
60
|
Requires-Dist: openai; extra == 'dev'
|
|
61
61
|
Requires-Dist: pyright==1.1.364; extra == 'dev'
|
|
62
|
+
Requires-Dist: pytest-asyncio; extra == 'dev'
|
|
63
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
64
|
+
Requires-Dist: pytest-mock; extra == 'dev'
|
|
62
65
|
Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
|
|
63
|
-
Requires-Dist: ruff==0.
|
|
66
|
+
Requires-Dist: ruff==0.11.8; extra == 'dev'
|
|
64
67
|
Description-Content-Type: text/markdown
|
|
65
68
|
|
|
66
69
|
# HUD
|
|
@@ -94,17 +97,17 @@ pip install hud-python
|
|
|
94
97
|
|
|
95
98
|
### Simple Browser Example with Claude Computer Use
|
|
96
99
|
|
|
97
|
-
> This example uses the `@
|
|
100
|
+
> This example uses the `@register_job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
|
|
98
101
|
|
|
99
102
|
Make sure you have defined your `ANTHROPIC_API_KEY` in environment variables to run Claude.
|
|
100
103
|
|
|
101
104
|
```python
|
|
102
105
|
import asyncio
|
|
103
|
-
from hud import gym,
|
|
106
|
+
from hud import gym, register_job
|
|
104
107
|
from hud.task import Task
|
|
105
108
|
from hud.agent import ClaudeAgent
|
|
106
109
|
|
|
107
|
-
@
|
|
110
|
+
@register_job("test-run")
|
|
108
111
|
async def main():
|
|
109
112
|
task = Task(
|
|
110
113
|
prompt="Insert the text 'capybara' into the search bar",
|
|
@@ -192,7 +195,7 @@ If you use this SDK in your research, please cite it as follows:
|
|
|
192
195
|
|
|
193
196
|
```bibtex
|
|
194
197
|
@software{hud2025agentevalplatform,
|
|
195
|
-
author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and
|
|
198
|
+
author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
|
|
196
199
|
title = {{HUD: An Evaluation Platform for Agents}},
|
|
197
200
|
date = {2025-04},
|
|
198
201
|
url = {https://github.com/hud-evals/hud-sdk},
|
|
@@ -29,17 +29,17 @@ pip install hud-python
|
|
|
29
29
|
|
|
30
30
|
### Simple Browser Example with Claude Computer Use
|
|
31
31
|
|
|
32
|
-
> This example uses the `@
|
|
32
|
+
> This example uses the `@register_job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
|
|
33
33
|
|
|
34
34
|
Make sure you have defined your `ANTHROPIC_API_KEY` in environment variables to run Claude.
|
|
35
35
|
|
|
36
36
|
```python
|
|
37
37
|
import asyncio
|
|
38
|
-
from hud import gym,
|
|
38
|
+
from hud import gym, register_job
|
|
39
39
|
from hud.task import Task
|
|
40
40
|
from hud.agent import ClaudeAgent
|
|
41
41
|
|
|
42
|
-
@
|
|
42
|
+
@register_job("test-run")
|
|
43
43
|
async def main():
|
|
44
44
|
task = Task(
|
|
45
45
|
prompt="Insert the text 'capybara' into the search bar",
|
|
@@ -127,7 +127,7 @@ If you use this SDK in your research, please cite it as follows:
|
|
|
127
127
|
|
|
128
128
|
```bibtex
|
|
129
129
|
@software{hud2025agentevalplatform,
|
|
130
|
-
author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and
|
|
130
|
+
author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
|
|
131
131
|
title = {{HUD: An Evaluation Platform for Agents}},
|
|
132
132
|
date = {2025-04},
|
|
133
133
|
url = {https://github.com/hud-evals/hud-sdk},
|
|
@@ -12,11 +12,11 @@ While the standard `step`, `evaluate`, and `close` methods cover most interactio
|
|
|
12
12
|
The `env._invoke_all()` method (and its underlying `client.invoke()`) is the core mechanism for calling specific functions *within* the environment's controller script.
|
|
13
13
|
|
|
14
14
|
```python
|
|
15
|
-
async def _invoke_all(self, configs:
|
|
15
|
+
async def _invoke_all(self, configs: FunctionConfigs) -> list[Any]: ...
|
|
16
16
|
```
|
|
17
17
|
|
|
18
18
|
* **Purpose:** Execute custom functions defined in your environment controller (the Python code running inside the Docker container or remote instance). This is how `setup` and `evaluate` configurations in a `Task` are ultimately executed.
|
|
19
|
-
* **Usage:** You provide a configuration (string, tuple, dict, or list) matching the `
|
|
19
|
+
* **Usage:** You provide a configuration (string, tuple, dict, or list) matching the `FunctionConfigs` format. The SDK sends this to the environment controller, which runs the specified function(s) with the given arguments.
|
|
20
20
|
* **When to Use:**
|
|
21
21
|
* Triggering custom evaluation logic not suitable for the standard `evaluate` attribute.
|
|
22
22
|
* Running specific diagnostic or state-setting functions within your custom environment controller during development or debugging.
|
|
@@ -71,7 +71,7 @@ print("Exit Code:", result['exit_code'])
|
|
|
71
71
|
## `_setup`
|
|
72
72
|
|
|
73
73
|
```python
|
|
74
|
-
async def _setup(self, config:
|
|
74
|
+
async def _setup(self, config: FunctionConfigs | None = None) -> None: ...
|
|
75
75
|
```
|
|
76
76
|
|
|
77
77
|
* **Purpose:** Executes the setup configuration for the environment.
|
|
@@ -25,11 +25,11 @@ class Environment(pydantic.BaseModel):
|
|
|
25
25
|
) -> tuple[Observation, float, bool, dict[str, Any]]: ...
|
|
26
26
|
|
|
27
27
|
async def evaluate(
|
|
28
|
-
self, config:
|
|
28
|
+
self, config: FunctionConfigs | None = None
|
|
29
29
|
) -> Any: ...
|
|
30
30
|
|
|
31
31
|
async def reset(
|
|
32
|
-
self, configs:
|
|
32
|
+
self, configs: FunctionConfigs | None = None
|
|
33
33
|
) -> tuple[Observation, dict[str, Any]]: ...
|
|
34
34
|
|
|
35
35
|
async def get_urls(self) -> dict[str, Any]: ...
|
|
@@ -37,8 +37,8 @@ class Environment(pydantic.BaseModel):
|
|
|
37
37
|
async def close(self) -> None: ...
|
|
38
38
|
|
|
39
39
|
# Internal/Advanced Methods
|
|
40
|
-
# async def _setup(self, config:
|
|
41
|
-
# async def _invoke_all(self, configs:
|
|
40
|
+
# async def _setup(self, config: FunctionConfigs | None = None) -> None: ...
|
|
41
|
+
# async def _invoke_all(self, configs: FunctionConfigs) -> list[Any]: ...
|
|
42
42
|
```
|
|
43
43
|
|
|
44
44
|
Represents a running instance (browser, OS) where an [Agent](/concepts/agent) interacts. Environments are typically created using `hud.gym.make()` rather than direct construction.
|
|
@@ -58,11 +58,11 @@ Represents a running instance (browser, OS) where an [Agent](/concepts/agent) in
|
|
|
58
58
|
* **Parameters:**
|
|
59
59
|
* `actions`: List of [CLA](/concepts/adapter) actions, or `None` to get initial observation.
|
|
60
60
|
* **Returns:** `(Observation, reward, terminated, info)` tuple. `reward` is typically 0 unless overridden by custom logic. `terminated` is typically `False`.
|
|
61
|
-
* **`evaluate(self, config:
|
|
61
|
+
* **`evaluate(self, config: FunctionConfigs | None = None)`:** Runs the evaluation logic defined in the [Task](/concepts/task) (or the provided `config`).
|
|
62
62
|
* **Parameters:**
|
|
63
63
|
* `config`: Optional override for evaluation logic using [Configuration Styles](/concepts/task#configuration-styles).
|
|
64
64
|
* **Returns:** The result from the evaluation function(s).
|
|
65
|
-
* **`reset(self, configs:
|
|
65
|
+
* **`reset(self, configs: FunctionConfigs | None = None)`:** Resets the environment state, usually running setup logic.
|
|
66
66
|
* **Parameters:**
|
|
67
67
|
* `configs`: Optional override for setup logic.
|
|
68
68
|
* **Returns:** `(Observation, info)` tuple after resetting. *(Note: `gym.make(task)` handles initial setup; direct `reset` is less common).*
|
|
@@ -20,7 +20,7 @@ async def make(
|
|
|
20
20
|
|
|
21
21
|
Creates and initializes an [Environment](/concepts/environment) instance based on a specification.
|
|
22
22
|
|
|
23
|
-
This function handles selecting the correct client (local docker, remote docker, remote direct) based on the `env_src`, automatically linking to an active [Job](/concepts/job) (from `@
|
|
23
|
+
This function handles selecting the correct client (local docker, remote docker, remote direct) based on the `env_src`, automatically linking to an active [Job](/concepts/job) (from `@register_job` decorator or the `job` parameter), and running the initial [Task](/concepts/task) setup if `env_src` is a `Task`.
|
|
24
24
|
|
|
25
25
|
**Parameters:**
|
|
26
26
|
|
|
@@ -28,7 +28,7 @@ This function handles selecting the correct client (local docker, remote docker,
|
|
|
28
28
|
* If a `str` (Gym ID like `"hud-browser"`, `"OSWorld-Ubuntu"`), creates a standard remote environment.
|
|
29
29
|
* If a `CustomGym` object, creates a custom environment based on its definition (local or remote docker).
|
|
30
30
|
* If a `Task` object, uses the `task.gym` attribute to determine the environment type and automatically runs `task.setup` after creation.
|
|
31
|
-
* **`job` (`Job` | None, optional):** A specific [Job](/concepts/job) object to associate this environment run with. If `None`, it attempts to find an active job created by the `@
|
|
31
|
+
* **`job` (`Job` | None, optional):** A specific [Job](/concepts/job) object to associate this environment run with. If `None`, it attempts to find an active job created by the `@register_job` decorator.
|
|
32
32
|
* **`metadata` (dict[str, Any] | None, optional):** Additional metadata to attach to the environment instance and its resulting trajectory.
|
|
33
33
|
|
|
34
34
|
**Returns:**
|
|
@@ -3,13 +3,13 @@ title: 'hud.job'
|
|
|
3
3
|
description: 'API reference for Jobs and related functions/decorators'
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
-
The `hud.job` module provides the `@
|
|
6
|
+
The `hud.job` module provides the `@register_job` decorator, functions to manage Jobs (`create_job`, `load_job`), and the `Job` class itself.
|
|
7
7
|
|
|
8
8
|
See the [Job Concepts](/concepts/job) page for explanations and usage examples.
|
|
9
9
|
|
|
10
10
|
# Decorators
|
|
11
11
|
|
|
12
|
-
## @
|
|
12
|
+
## @register_job
|
|
13
13
|
|
|
14
14
|
```python
|
|
15
15
|
def register_job(
|
|
@@ -92,7 +92,7 @@ class Job(pydantic.BaseModel):
|
|
|
92
92
|
) -> list[Trajectory]: ...
|
|
93
93
|
```
|
|
94
94
|
|
|
95
|
-
Represents a Job, typically obtained via `@
|
|
95
|
+
Represents a Job, typically obtained via `@register_job`, `create_job`, or `load_job`. Primarily used to access associated trajectories.
|
|
96
96
|
|
|
97
97
|
**Attributes:**
|
|
98
98
|
|
|
@@ -13,8 +13,8 @@ The `hud.task` module provides the `Task` class for defining evaluation scenario
|
|
|
13
13
|
class Task(pydantic.BaseModel):
|
|
14
14
|
id: str | None = None
|
|
15
15
|
prompt: str
|
|
16
|
-
setup:
|
|
17
|
-
evaluate:
|
|
16
|
+
setup: FunctionConfigs | None = None
|
|
17
|
+
evaluate: FunctionConfigs | None = None
|
|
18
18
|
gym: Gym | None = None
|
|
19
19
|
target: str | list[str] | None = None # Inspect compatibility
|
|
20
20
|
choices: list[str] | None = None # Inspect compatibility
|
|
@@ -33,8 +33,8 @@ See the [Tasks and TaskSets Concepts](/concepts/task) page for detailed explanat
|
|
|
33
33
|
|
|
34
34
|
* **`id` (str | None):** Optional unique identifier, often assigned when loaded from the HUD platform.
|
|
35
35
|
* **`prompt` (str):** The main instruction or goal for the agent.
|
|
36
|
-
* **`setup` (`
|
|
37
|
-
* **`evaluate` (`
|
|
36
|
+
* **`setup` (`FunctionConfigs` | None):** Configuration for setup actions executed before the agent starts. See [Configuration Styles](/concepts/task#configuration-styles).
|
|
37
|
+
* **`evaluate` (`FunctionConfigs` | None):** Configuration defining the evaluation logic executed by `env.evaluate()`. See [Configuration Styles](/concepts/task#configuration-styles).
|
|
38
38
|
* **`gym` (`Gym` | None):** Specifies the required environment type (e.g., `"hud-browser"`, `CustomGym` object). See `hud.types`.
|
|
39
39
|
* **`target` (str | list[str] | None):** Ideal target output (primarily for compatibility with `inspect-ai`).
|
|
40
40
|
* **`choices` (list[str] | None):** Multiple choice options (primarily for compatibility with `inspect-ai`).
|
|
@@ -46,7 +46,7 @@ env_os = await gym.make("OSWorld-Ubuntu")
|
|
|
46
46
|
# await env_os.close()
|
|
47
47
|
```
|
|
48
48
|
|
|
49
|
-
Environments created this way won't have a default `Task` associated unless you explicitly reset them with one later using `env.reset()`. The `gym.make()` function also automatically links the environment to an active [Job](/concepts/job) if one was defined using the `@
|
|
49
|
+
Environments created this way won't have a default `Task` associated unless you explicitly reset them with one later using `env.reset()`. The `gym.make()` function also automatically links the environment to an active [Job](/concepts/job) if one was defined using the `@register_job` decorator.
|
|
50
50
|
|
|
51
51
|
## Available Environment Types
|
|
52
52
|
|
|
@@ -92,10 +92,10 @@ for _ in range(10):
|
|
|
92
92
|
## Key Methods
|
|
93
93
|
|
|
94
94
|
* **`env.step(actions: list[CLA] | None = None)`**: Executes actions (or gets initial state). Returns `(Observation, reward, terminated, info)`.
|
|
95
|
-
* **`env.evaluate(config:
|
|
95
|
+
* **`env.evaluate(config: FunctionConfigs | None = None)`**: Runs evaluation logic defined in the [Task](/concepts/task) (or the provided `config`). Returns evaluation result.
|
|
96
96
|
* **`env.close()`**: Shuts down the environment. Saves the [Trajectory](/concepts/trajectory) if linked to a [Job](/concepts/job).
|
|
97
97
|
* **`env.get_urls()`**: Returns URLs (`url`, `live_url`) for accessing/viewing the environment.
|
|
98
|
-
* **`env.reset(configs:
|
|
98
|
+
* **`env.reset(configs: FunctionConfigs | None = None)`**: Resets state, often running setup steps. *Mostly used internally or for environments created without an initial Task.*
|
|
99
99
|
* **`env._setup(...)` / `env._invoke_all(...)`**: Internal methods for running setup/evaluate/custom configurations defined in a [Task](/concepts/task).
|
|
100
100
|
|
|
101
101
|
## Observations
|
|
@@ -110,5 +110,5 @@ The `Observation` object returned by `env.step()` contains:
|
|
|
110
110
|
* [Task](/concepts/task): Defines the environment type (`gym`), `setup`, and `evaluate` logic.
|
|
111
111
|
* [Agent](/concepts/agent): Interacts with the Environment via the `step` and `predict` methods.
|
|
112
112
|
* [Adapter](/concepts/adapter): Ensures actions passed to `step` are in the correct `CLA` format.
|
|
113
|
-
* [Job](/concepts/job): Groups environment runs; linking happens via `@
|
|
113
|
+
* [Job](/concepts/job): Groups environment runs; linking happens via `@register_job` or `gym.make(job=...)`.
|
|
114
114
|
* [Trajectory](/concepts/trajectory): The recording generated when a job-linked environment is closed.
|
|
@@ -18,16 +18,16 @@ Jobs help organize evaluation data, useful for:
|
|
|
18
18
|
|
|
19
19
|
## Creating Jobs
|
|
20
20
|
|
|
21
|
-
### 1. The `@
|
|
21
|
+
### 1. The `@register_job` Decorator (Recommended)
|
|
22
22
|
|
|
23
23
|
Decorate an `async` function. A new Job is created per function call, and any environments created within using `hud.gym.make()` are automatically linked.
|
|
24
24
|
|
|
25
25
|
```python
|
|
26
|
-
from hud import gym,
|
|
26
|
+
from hud import gym, register_job
|
|
27
27
|
from hud.task import Task
|
|
28
28
|
from hud.agent import OperatorAgent # Example agent
|
|
29
29
|
|
|
30
|
-
@
|
|
30
|
+
@register_job(name="my-evaluation-run", metadata={"agent_version": "1.1"})
|
|
31
31
|
async def run_evaluation():
|
|
32
32
|
task = Task(prompt="Example", gym="hud-browser")
|
|
33
33
|
env = await gym.make(task) # Linked to "my-evaluation-run" job
|
|
@@ -89,7 +89,7 @@ async def analyze_job(job_id: str):
|
|
|
89
89
|
|
|
90
90
|
## Best Practices
|
|
91
91
|
|
|
92
|
-
* Use `@
|
|
92
|
+
* Use `@register_job` for most scripts.
|
|
93
93
|
* Use descriptive names and metadata.
|
|
94
94
|
* Create separate jobs for distinct experiments.
|
|
95
95
|
|
|
@@ -15,8 +15,8 @@ A `Task` object provides the configuration for a specific scenario.
|
|
|
15
15
|
|
|
16
16
|
* **`prompt` (str):** The primary instruction given to the agent.
|
|
17
17
|
* **`gym` (str | `CustomGym` | None):** Specifies the type of [Environment](/concepts/environment) needed. Used by `hud.gym.make()`.
|
|
18
|
-
* **`setup` (`
|
|
19
|
-
* **`evaluate` (`
|
|
18
|
+
* **`setup` (`FunctionConfigs` | None):** Defines actions executed *before* the agent starts. See [Setup Configuration](#setup-configuration).
|
|
19
|
+
* **`evaluate` (`FunctionConfigs` | None):** Defines how to check if the agent succeeded *after* interaction. See [Evaluation Configuration](#evaluation-configuration).
|
|
20
20
|
* **`id` (str | None):** Optional identifier.
|
|
21
21
|
* **`metadata` (dict | None):** Optional dictionary for extra information.
|
|
22
22
|
* **`config` (dict | None):** Optional dictionary, primarily for remote execution.
|
|
@@ -41,7 +41,7 @@ task = Task(
|
|
|
41
41
|
|
|
42
42
|
### <a name="configuration-styles"></a>Configuration Styles (`setup` and `evaluate`)
|
|
43
43
|
|
|
44
|
-
Both `setup` and `evaluate` accept configurations defining function calls within the environment's controller, using flexible formats (`
|
|
44
|
+
Both `setup` and `evaluate` accept configurations defining function calls within the environment's controller, using flexible formats (`FunctionConfigs`):
|
|
45
45
|
|
|
46
46
|
1. **String:** `"browser.maximize"`
|
|
47
47
|
2. **Tuple:** `("goto", "https://google.com")`
|
|
@@ -43,13 +43,13 @@ This example uses the `OperatorAgent` to interact with a browser environment. It
|
|
|
43
43
|
```python
|
|
44
44
|
import asyncio
|
|
45
45
|
import os
|
|
46
|
-
from hud import gym,
|
|
46
|
+
from hud import gym, register_job # Import gym for environments and job decorator
|
|
47
47
|
from hud.task import Task # Import Task to define the goal
|
|
48
48
|
from hud.agent import OperatorAgent # Import the agent
|
|
49
49
|
# hud.settings automatically loads keys from .env or environment variables
|
|
50
50
|
|
|
51
51
|
# Decorator to group this run under a job named "quickstart-run"
|
|
52
|
-
@
|
|
52
|
+
@register_job("quickstart-run")
|
|
53
53
|
async def main():
|
|
54
54
|
# 1. Define a Task: What should the agent do?
|
|
55
55
|
task = Task(
|
|
@@ -111,7 +111,7 @@ if __name__ == "__main__":
|
|
|
111
111
|
* `agent.predict(obs)` gets the next action(s) from the agent.
|
|
112
112
|
* `env.step(actions)` executes the actions and gets the new observation.
|
|
113
113
|
5. **Evaluation & Close:** `env.evaluate()` checks if the task succeeded based on the `evaluate` definition. `env.close()` shuts down the environment.
|
|
114
|
-
6. **`@
|
|
114
|
+
6. **`@register_job` Decorator:** Wrapping `main` with `@register_job("quickstart-run")` automatically creates a Job. When `env.close()` is called, the recorded interactions (trajectory) are associated with this Job. You can view the job and its trajectory video on the [HUD Jobs page](https://app.hud.so/jobs).
|
|
115
115
|
|
|
116
116
|
## Next Steps
|
|
117
117
|
|
|
@@ -29,7 +29,7 @@ This is the most straightforward approach if your agent logic can directly gener
|
|
|
29
29
|
|
|
30
30
|
```python
|
|
31
31
|
import asyncio
|
|
32
|
-
from hud import gym,
|
|
32
|
+
from hud import gym, register_job
|
|
33
33
|
from hud.task import Task
|
|
34
34
|
from hud.env import Observation
|
|
35
35
|
# Import specific CLA types you need
|
|
@@ -48,7 +48,7 @@ def my_custom_agent_logic(observation: Observation) -> list[CLA]:
|
|
|
48
48
|
# Ensure the return type is list[CLA]
|
|
49
49
|
return actions
|
|
50
50
|
|
|
51
|
-
@
|
|
51
|
+
@register_job("custom-cla-agent-run")
|
|
52
52
|
async def main():
|
|
53
53
|
task = Task(prompt="Click and type", gym="hud-browser")
|
|
54
54
|
env = await gym.make(task)
|
|
@@ -90,7 +90,7 @@ This approach leverages the SDK's structure for a more integrated solution.
|
|
|
90
90
|
```python
|
|
91
91
|
import asyncio
|
|
92
92
|
from typing import Any # Placeholder for your raw action type
|
|
93
|
-
from hud import gym,
|
|
93
|
+
from hud import gym, register_job
|
|
94
94
|
from hud.task import Task
|
|
95
95
|
from hud.env import Observation
|
|
96
96
|
from hud.agent import Agent # Import base class
|
|
@@ -136,7 +136,7 @@ class MyAdapter(Adapter):
|
|
|
136
136
|
raise ValueError(f"Unknown raw action type: {raw_action}")
|
|
137
137
|
|
|
138
138
|
# --- Usage ---
|
|
139
|
-
@
|
|
139
|
+
@register_job("custom-agent-framework-run")
|
|
140
140
|
async def main():
|
|
141
141
|
task = Task(prompt="Use custom agent", gym="hud-browser")
|
|
142
142
|
env = await gym.make(task)
|
|
@@ -181,13 +181,13 @@ This approach uses HUD primarily for environment provisioning and lifecycle mana
|
|
|
181
181
|
```python
|
|
182
182
|
import asyncio
|
|
183
183
|
import os
|
|
184
|
-
from hud import gym,
|
|
184
|
+
from hud import gym, register_job
|
|
185
185
|
from hud.task import Task
|
|
186
186
|
from hud.utils import stream # For live view
|
|
187
187
|
# Need external library, e.g., pyppeteer (pip install pyppeteer)
|
|
188
188
|
# import pyppeteer
|
|
189
189
|
|
|
190
|
-
@
|
|
190
|
+
@register_job("external-control-run")
|
|
191
191
|
async def main():
|
|
192
192
|
task = Task(prompt="Externally controlled task", gym="hud-browser", setup=("goto", "google.com"))
|
|
193
193
|
env = await gym.make(task)
|