hud-python 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. {hud_python-0.2.2 → hud_python-0.2.3}/.github/workflows/ci.yml +3 -3
  2. {hud_python-0.2.2 → hud_python-0.2.3}/.gitignore +2 -0
  3. {hud_python-0.2.2 → hud_python-0.2.3}/PKG-INFO +9 -6
  4. {hud_python-0.2.2 → hud_python-0.2.3}/README.md +4 -4
  5. {hud_python-0.2.2 → hud_python-0.2.3}/docs/advanced/environment-control.mdx +3 -3
  6. {hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/env.mdx +6 -6
  7. {hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/gym.mdx +2 -2
  8. {hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/job.mdx +3 -3
  9. {hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/task.mdx +4 -4
  10. {hud_python-0.2.2 → hud_python-0.2.3}/docs/concepts/environment.mdx +4 -4
  11. {hud_python-0.2.2 → hud_python-0.2.3}/docs/concepts/job.mdx +4 -4
  12. {hud_python-0.2.2 → hud_python-0.2.3}/docs/concepts/task.mdx +3 -3
  13. {hud_python-0.2.2 → hud_python-0.2.3}/docs/quickstart.mdx +3 -3
  14. {hud_python-0.2.2 → hud_python-0.2.3}/docs/running-your-agent.mdx +6 -6
  15. {hud_python-0.2.2 → hud_python-0.2.3}/environments/novnc_ubuntu/src/novnc_ubuntu/__init__.py +1 -0
  16. {hud_python-0.2.2 → hud_python-0.2.3}/environments/novnc_ubuntu/src/novnc_ubuntu/pyautogui_rosetta.py +76 -63
  17. {hud_python-0.2.2 → hud_python-0.2.3}/environments/novnc_ubuntu/src/novnc_ubuntu/step.py +0 -1
  18. {hud_python-0.2.2 → hud_python-0.2.3}/examples/browser_use.ipynb +5 -12
  19. {hud_python-0.2.2 → hud_python-0.2.3}/examples/ds_upload.ipynb +39 -36
  20. hud_python-0.2.3/examples/example.ipynb +86 -0
  21. {hud_python-0.2.2 → hud_python-0.2.3}/examples/inspect.ipynb +9 -13
  22. {hud_python-0.2.2 → hud_python-0.2.3}/examples/jobs.ipynb +4 -3
  23. {hud_python-0.2.2 → hud_python-0.2.3}/examples/local.ipynb +7 -10
  24. {hud_python-0.2.2 → hud_python-0.2.3}/examples/osworld.ipynb +32 -30
  25. {hud_python-0.2.2 → hud_python-0.2.3}/examples/tasks.ipynb +12 -15
  26. {hud_python-0.2.2 → hud_python-0.2.3}/hud/__init__.py +4 -3
  27. {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/claude/adapter.py +5 -14
  28. {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/common/adapter.py +3 -3
  29. hud_python-0.2.3/hud/adapters/common/tests/test_adapter.py +277 -0
  30. {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/common/types.py +3 -3
  31. {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/operator/adapter.py +16 -23
  32. {hud_python-0.2.2 → hud_python-0.2.3}/hud/agent/__init__.py +8 -1
  33. {hud_python-0.2.2 → hud_python-0.2.3}/hud/agent/base.py +28 -28
  34. {hud_python-0.2.2 → hud_python-0.2.3}/hud/agent/claude.py +69 -60
  35. {hud_python-0.2.2 → hud_python-0.2.3}/hud/agent/langchain.py +32 -26
  36. {hud_python-0.2.2 → hud_python-0.2.3}/hud/agent/operator.py +75 -67
  37. {hud_python-0.2.2 → hud_python-0.2.3}/hud/env/__init__.py +5 -5
  38. {hud_python-0.2.2 → hud_python-0.2.3}/hud/env/client.py +2 -2
  39. {hud_python-0.2.2 → hud_python-0.2.3}/hud/env/docker_client.py +37 -39
  40. {hud_python-0.2.2 → hud_python-0.2.3}/hud/env/environment.py +91 -66
  41. {hud_python-0.2.2 → hud_python-0.2.3}/hud/env/local_docker_client.py +5 -7
  42. {hud_python-0.2.2 → hud_python-0.2.3}/hud/env/remote_client.py +39 -32
  43. {hud_python-0.2.2 → hud_python-0.2.3}/hud/env/remote_docker_client.py +13 -3
  44. {hud_python-0.2.2 → hud_python-0.2.3}/hud/evaluators/__init__.py +2 -3
  45. {hud_python-0.2.2 → hud_python-0.2.3}/hud/evaluators/base.py +4 -3
  46. {hud_python-0.2.2 → hud_python-0.2.3}/hud/evaluators/inspect.py +3 -8
  47. {hud_python-0.2.2 → hud_python-0.2.3}/hud/evaluators/judge.py +34 -58
  48. {hud_python-0.2.2 → hud_python-0.2.3}/hud/evaluators/match.py +42 -49
  49. {hud_python-0.2.2 → hud_python-0.2.3}/hud/evaluators/remote.py +13 -26
  50. hud_python-0.2.3/hud/evaluators/tests/test_inspect.py +12 -0
  51. hud_python-0.2.3/hud/evaluators/tests/test_judge.py +231 -0
  52. hud_python-0.2.3/hud/evaluators/tests/test_match.py +115 -0
  53. hud_python-0.2.3/hud/evaluators/tests/test_remote.py +98 -0
  54. hud_python-0.2.3/hud/exceptions.py +167 -0
  55. {hud_python-0.2.2 → hud_python-0.2.3}/hud/gym.py +9 -7
  56. {hud_python-0.2.2 → hud_python-0.2.3}/hud/job.py +179 -109
  57. hud_python-0.2.3/hud/py.typed +0 -0
  58. hud_python-0.2.3/hud/server/__init__.py +5 -0
  59. hud_python-0.2.3/hud/server/requests.py +242 -0
  60. hud_python-0.2.3/hud/server/tests/__init__.py +0 -0
  61. hud_python-0.2.3/hud/server/tests/test_requests.py +275 -0
  62. {hud_python-0.2.2 → hud_python-0.2.3}/hud/settings.py +3 -2
  63. {hud_python-0.2.2 → hud_python-0.2.3}/hud/task.py +9 -19
  64. {hud_python-0.2.2 → hud_python-0.2.3}/hud/taskset.py +44 -11
  65. {hud_python-0.2.2 → hud_python-0.2.3}/hud/trajectory.py +6 -9
  66. {hud_python-0.2.2 → hud_python-0.2.3}/hud/types.py +12 -9
  67. {hud_python-0.2.2 → hud_python-0.2.3}/hud/utils/__init__.py +2 -2
  68. {hud_python-0.2.2 → hud_python-0.2.3}/hud/utils/common.py +36 -15
  69. {hud_python-0.2.2 → hud_python-0.2.3}/hud/utils/config.py +45 -30
  70. {hud_python-0.2.2 → hud_python-0.2.3}/hud/utils/progress.py +34 -21
  71. {hud_python-0.2.2 → hud_python-0.2.3}/hud/utils/telemetry.py +10 -11
  72. hud_python-0.2.3/hud/utils/tests/__init__.py +0 -0
  73. hud_python-0.2.3/hud/utils/tests/test_common.py +52 -0
  74. hud_python-0.2.3/hud/utils/tests/test_config.py +129 -0
  75. hud_python-0.2.3/hud/utils/tests/test_progress.py +225 -0
  76. hud_python-0.2.3/hud/utils/tests/test_telemetry.py +37 -0
  77. hud_python-0.2.2/tests/test_import.py → hud_python-0.2.3/hud/utils/tests/test_version.py +2 -1
  78. {hud_python-0.2.2 → hud_python-0.2.3}/pyproject.toml +26 -2
  79. hud_python-0.2.2/hud/server/__init__.py +0 -5
  80. hud_python-0.2.2/hud/server/requests.py +0 -280
  81. {hud_python-0.2.2 → hud_python-0.2.3}/.env.example +0 -0
  82. {hud_python-0.2.2 → hud_python-0.2.3}/.github/workflows/release.yml +0 -0
  83. {hud_python-0.2.2 → hud_python-0.2.3}/LICENSE +0 -0
  84. {hud_python-0.2.2 → hud_python-0.2.3}/MANIFEST.in +0 -0
  85. {hud_python-0.2.2 → hud_python-0.2.3}/docs/advanced/cla-details.mdx +0 -0
  86. {hud_python-0.2.2 → hud_python-0.2.3}/docs/advanced/custom-environments.mdx +0 -0
  87. {hud_python-0.2.2 → hud_python-0.2.3}/docs/api/reference/adapters.mdx +0 -0
  88. {hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/adapters.mdx +0 -0
  89. {hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/taskset.mdx +0 -0
  90. {hud_python-0.2.2 → hud_python-0.2.3}/docs/api-reference/trajectory.mdx +0 -0
  91. {hud_python-0.2.2 → hud_python-0.2.3}/docs/concepts/adapter.mdx +0 -0
  92. {hud_python-0.2.2 → hud_python-0.2.3}/docs/concepts/agent.mdx +0 -0
  93. {hud_python-0.2.2 → hud_python-0.2.3}/docs/concepts/trajectory.mdx +0 -0
  94. {hud_python-0.2.2 → hud_python-0.2.3}/docs/docs.json +0 -0
  95. {hud_python-0.2.2 → hud_python-0.2.3}/docs/environments/hud-browser.mdx +0 -0
  96. {hud_python-0.2.2 → hud_python-0.2.3}/docs/environments/hud-ubuntu.mdx +0 -0
  97. {hud_python-0.2.2 → hud_python-0.2.3}/docs/environments/qa.mdx +0 -0
  98. {hud_python-0.2.2 → hud_python-0.2.3}/docs/examples/basic.mdx +0 -0
  99. {hud_python-0.2.2 → hud_python-0.2.3}/docs/examples/claude-agent.mdx +0 -0
  100. {hud_python-0.2.2 → hud_python-0.2.3}/docs/examples/custom-agent.mdx +0 -0
  101. {hud_python-0.2.2 → hud_python-0.2.3}/docs/favicon.png +0 -0
  102. {hud_python-0.2.2 → hud_python-0.2.3}/docs/installation.mdx +0 -0
  103. {hud_python-0.2.2 → hud_python-0.2.3}/docs/logo/HUD-light-optimized.svg +0 -0
  104. {hud_python-0.2.2 → hud_python-0.2.3}/docs/logo/HUD.svg +0 -0
  105. {hud_python-0.2.2 → hud_python-0.2.3}/environments/novnc_ubuntu/Dockerfile +0 -0
  106. {hud_python-0.2.2 → hud_python-0.2.3}/environments/novnc_ubuntu/pyproject.toml +0 -0
  107. {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/Dockerfile +0 -0
  108. {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/pyproject.toml +0 -0
  109. {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/__init__.py +0 -0
  110. {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/evaluate/__init__.py +0 -0
  111. {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/evaluate/matchers.py +0 -0
  112. {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/info.py +0 -0
  113. {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/setup/__init__.py +0 -0
  114. {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/setup/question.py +0 -0
  115. {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/step.py +0 -0
  116. {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/utils/__init__.py +0 -0
  117. {hud_python-0.2.2 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/utils/state.py +0 -0
  118. {hud_python-0.2.2 → hud_python-0.2.3}/examples/README.md +0 -0
  119. {hud_python-0.2.2 → hud_python-0.2.3}/examples/WebVoyager_data.jsonl +0 -0
  120. {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/__init__.py +0 -0
  121. {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/claude/__init__.py +0 -0
  122. {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/common/__init__.py +0 -0
  123. {hud_python-0.2.2 → hud_python-0.2.3/hud/adapters/common}/tests/__init__.py +0 -0
  124. {hud_python-0.2.2 → hud_python-0.2.3}/hud/adapters/operator/__init__.py +0 -0
  125. /hud_python-0.2.2/hud/py.typed → /hud_python-0.2.3/hud/evaluators/tests/__init__.py +0 -0
@@ -4,7 +4,7 @@ on:
4
4
  push:
5
5
  branches: [ "main" ]
6
6
  pull_request:
7
- branches: [ "main" ]
7
+ branches: [ "*" ]
8
8
 
9
9
  jobs:
10
10
  test:
@@ -24,7 +24,7 @@ jobs:
24
24
  run: uv python install ${{ matrix.python-version }}
25
25
 
26
26
  - name: Run tests
27
- run: uv run --python ${{ matrix.python-version }} --with=".[dev]" pytest
27
+ run: uv run --python ${{ matrix.python-version }} --with=".[dev]" pytest --rootdir=hud --cov --cov-report=''
28
28
 
29
29
  lint-ruff:
30
30
  runs-on: ubuntu-latest
@@ -35,7 +35,7 @@ jobs:
35
35
 
36
36
  - name: Run ruff
37
37
  run: |
38
- uv run --with=".[dev]" ruff format .
38
+ uv run --with=".[dev]" ruff format . --check
39
39
  uv run --with=".[dev]" ruff check .
40
40
 
41
41
  lint-pyright:
@@ -25,3 +25,5 @@ uv.lock
25
25
  /*.ipynb
26
26
  test.json
27
27
  TODO.md
28
+
29
+ .coverage
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: SDK for the HUD evaluation platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
@@ -59,8 +59,11 @@ Requires-Dist: jupyter-client; extra == 'dev'
59
59
  Requires-Dist: jupyter-core; extra == 'dev'
60
60
  Requires-Dist: openai; extra == 'dev'
61
61
  Requires-Dist: pyright==1.1.364; extra == 'dev'
62
+ Requires-Dist: pytest-asyncio; extra == 'dev'
63
+ Requires-Dist: pytest-cov; extra == 'dev'
64
+ Requires-Dist: pytest-mock; extra == 'dev'
62
65
  Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
63
- Requires-Dist: ruff==0.9.8; extra == 'dev'
66
+ Requires-Dist: ruff==0.11.8; extra == 'dev'
64
67
  Description-Content-Type: text/markdown
65
68
 
66
69
  # HUD
@@ -94,17 +97,17 @@ pip install hud-python
94
97
 
95
98
  ### Simple Browser Example with Claude Computer Use
96
99
 
97
- > This example uses the `@job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on the your [HUD Jobs page](https://app.hud.so/jobs).
100
+ > This example uses the `@register_job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
98
101
 
99
102
  Make sure you have defined your `ANTHROPIC_API_KEY` in environment variables to run Claude.
100
103
 
101
104
  ```python
102
105
  import asyncio
103
- from hud import gym, job
106
+ from hud import gym, register_job
104
107
  from hud.task import Task
105
108
  from hud.agent import ClaudeAgent
106
109
 
107
- @job("test-run")
110
+ @register_job("test-run")
108
111
  async def main():
109
112
  task = Task(
110
113
  prompt="Insert the text 'capybara' into the search bar",
@@ -192,7 +195,7 @@ If you use this SDK in your research, please cite it as follows:
192
195
 
193
196
  ```bibtex
194
197
  @software{hud2025agentevalplatform,
195
- author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Max Muoto and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
198
+ author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
196
199
  title = {{HUD: An Evaluation Platform for Agents}},
197
200
  date = {2025-04},
198
201
  url = {https://github.com/hud-evals/hud-sdk},
@@ -29,17 +29,17 @@ pip install hud-python
29
29
 
30
30
  ### Simple Browser Example with Claude Computer Use
31
31
 
32
- > This example uses the `@job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on the your [HUD Jobs page](https://app.hud.so/jobs).
32
+ > This example uses the `@register_job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
33
33
 
34
34
  Make sure you have defined your `ANTHROPIC_API_KEY` in environment variables to run Claude.
35
35
 
36
36
  ```python
37
37
  import asyncio
38
- from hud import gym, job
38
+ from hud import gym, register_job
39
39
  from hud.task import Task
40
40
  from hud.agent import ClaudeAgent
41
41
 
42
- @job("test-run")
42
+ @register_job("test-run")
43
43
  async def main():
44
44
  task = Task(
45
45
  prompt="Insert the text 'capybara' into the search bar",
@@ -127,7 +127,7 @@ If you use this SDK in your research, please cite it as follows:
127
127
 
128
128
  ```bibtex
129
129
  @software{hud2025agentevalplatform,
130
- author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Max Muoto and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
130
+ author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
131
131
  title = {{HUD: An Evaluation Platform for Agents}},
132
132
  date = {2025-04},
133
133
  url = {https://github.com/hud-evals/hud-sdk},
@@ -12,11 +12,11 @@ While the standard `step`, `evaluate`, and `close` methods cover most interactio
12
12
  The `env._invoke_all()` method (and its underlying `client.invoke()`) is the core mechanism for calling specific functions *within* the environment's controller script.
13
13
 
14
14
  ```python
15
- async def _invoke_all(self, configs: HudStyleConfigs) -> list[Any]: ...
15
+ async def _invoke_all(self, configs: FunctionConfigs) -> list[Any]: ...
16
16
  ```
17
17
 
18
18
  * **Purpose:** Execute custom functions defined in your environment controller (the Python code running inside the Docker container or remote instance). This is how `setup` and `evaluate` configurations in a `Task` are ultimately executed.
19
- * **Usage:** You provide a configuration (string, tuple, dict, or list) matching the `HudStyleConfigs` format. The SDK sends this to the environment controller, which runs the specified function(s) with the given arguments.
19
+ * **Usage:** You provide a configuration (string, tuple, dict, or list) matching the `FunctionConfigs` format. The SDK sends this to the environment controller, which runs the specified function(s) with the given arguments.
20
20
  * **When to Use:**
21
21
  * Triggering custom evaluation logic not suitable for the standard `evaluate` attribute.
22
22
  * Running specific diagnostic or state-setting functions within your custom environment controller during development or debugging.
@@ -71,7 +71,7 @@ print("Exit Code:", result['exit_code'])
71
71
  ## `_setup`
72
72
 
73
73
  ```python
74
- async def _setup(self, config: HudStyleConfigs | None = None) -> None: ...
74
+ async def _setup(self, config: FunctionConfigs | None = None) -> None: ...
75
75
  ```
76
76
 
77
77
  * **Purpose:** Executes the setup configuration for the environment.
@@ -25,11 +25,11 @@ class Environment(pydantic.BaseModel):
25
25
  ) -> tuple[Observation, float, bool, dict[str, Any]]: ...
26
26
 
27
27
  async def evaluate(
28
- self, config: HudStyleConfigs | None = None
28
+ self, config: FunctionConfigs | None = None
29
29
  ) -> Any: ...
30
30
 
31
31
  async def reset(
32
- self, configs: HudStyleConfigs | None = None
32
+ self, configs: FunctionConfigs | None = None
33
33
  ) -> tuple[Observation, dict[str, Any]]: ...
34
34
 
35
35
  async def get_urls(self) -> dict[str, Any]: ...
@@ -37,8 +37,8 @@ class Environment(pydantic.BaseModel):
37
37
  async def close(self) -> None: ...
38
38
 
39
39
  # Internal/Advanced Methods
40
- # async def _setup(self, config: HudStyleConfigs | None = None) -> None: ...
41
- # async def _invoke_all(self, configs: HudStyleConfigs) -> list[Any]: ...
40
+ # async def _setup(self, config: FunctionConfigs | None = None) -> None: ...
41
+ # async def _invoke_all(self, configs: FunctionConfigs) -> list[Any]: ...
42
42
  ```
43
43
 
44
44
  Represents a running instance (browser, OS) where an [Agent](/concepts/agent) interacts. Environments are typically created using `hud.gym.make()` rather than direct construction.
@@ -58,11 +58,11 @@ Represents a running instance (browser, OS) where an [Agent](/concepts/agent) in
58
58
  * **Parameters:**
59
59
  * `actions`: List of [CLA](/concepts/adapter) actions, or `None` to get initial observation.
60
60
  * **Returns:** `(Observation, reward, terminated, info)` tuple. `reward` is typically 0 unless overridden by custom logic. `terminated` is typically `False`.
61
- * **`evaluate(self, config: HudStyleConfigs | None = None)`:** Runs the evaluation logic defined in the [Task](/concepts/task) (or the provided `config`).
61
+ * **`evaluate(self, config: FunctionConfigs | None = None)`:** Runs the evaluation logic defined in the [Task](/concepts/task) (or the provided `config`).
62
62
  * **Parameters:**
63
63
  * `config`: Optional override for evaluation logic using [Configuration Styles](/concepts/task#configuration-styles).
64
64
  * **Returns:** The result from the evaluation function(s).
65
- * **`reset(self, configs: HudStyleConfigs | None = None)`:** Resets the environment state, usually running setup logic.
65
+ * **`reset(self, configs: FunctionConfigs | None = None)`:** Resets the environment state, usually running setup logic.
66
66
  * **Parameters:**
67
67
  * `configs`: Optional override for setup logic.
68
68
  * **Returns:** `(Observation, info)` tuple after resetting. *(Note: `gym.make(task)` handles initial setup; direct `reset` is less common).*
@@ -20,7 +20,7 @@ async def make(
20
20
 
21
21
  Creates and initializes an [Environment](/concepts/environment) instance based on a specification.
22
22
 
23
- This function handles selecting the correct client (local docker, remote docker, remote direct) based on the `env_src`, automatically linking to an active [Job](/concepts/job) (from `@job` decorator or the `job` parameter), and running the initial [Task](/concepts/task) setup if `env_src` is a `Task`.
23
+ This function handles selecting the correct client (local docker, remote docker, remote direct) based on the `env_src`, automatically linking to an active [Job](/concepts/job) (from `@register_job` decorator or the `job` parameter), and running the initial [Task](/concepts/task) setup if `env_src` is a `Task`.
24
24
 
25
25
  **Parameters:**
26
26
 
@@ -28,7 +28,7 @@ This function handles selecting the correct client (local docker, remote docker,
28
28
  * If a `str` (Gym ID like `"hud-browser"`, `"OSWorld-Ubuntu"`), creates a standard remote environment.
29
29
  * If a `CustomGym` object, creates a custom environment based on its definition (local or remote docker).
30
30
  * If a `Task` object, uses the `task.gym` attribute to determine the environment type and automatically runs `task.setup` after creation.
31
- * **`job` (`Job` | None, optional):** A specific [Job](/concepts/job) object to associate this environment run with. If `None`, it attempts to find an active job created by the `@job` decorator.
31
+ * **`job` (`Job` | None, optional):** A specific [Job](/concepts/job) object to associate this environment run with. If `None`, it attempts to find an active job created by the `@register_job` decorator.
32
32
  * **`metadata` (dict[str, Any] | None, optional):** Additional metadata to attach to the environment instance and its resulting trajectory.
33
33
 
34
34
  **Returns:**
@@ -3,13 +3,13 @@ title: 'hud.job'
3
3
  description: 'API reference for Jobs and related functions/decorators'
4
4
  ---
5
5
 
6
- The `hud.job` module provides the `@job` decorator, functions to manage Jobs (`create_job`, `load_job`), and the `Job` class itself.
6
+ The `hud.job` module provides the `@register_job` decorator, functions to manage Jobs (`create_job`, `load_job`), and the `Job` class itself.
7
7
 
8
8
  See the [Job Concepts](/concepts/job) page for explanations and usage examples.
9
9
 
10
10
  # Decorators
11
11
 
12
- ## @job
12
+ ## @register_job
13
13
 
14
14
  ```python
15
15
  def job(
@@ -92,7 +92,7 @@ class Job(pydantic.BaseModel):
92
92
  ) -> list[Trajectory]: ...
93
93
  ```
94
94
 
95
- Represents a Job, typically obtained via `@job`, `create_job`, or `load_job`. Primarily used to access associated trajectories.
95
+ Represents a Job, typically obtained via `@register_job`, `create_job`, or `load_job`. Primarily used to access associated trajectories.
96
96
 
97
97
  **Attributes:**
98
98
 
@@ -13,8 +13,8 @@ The `hud.task` module provides the `Task` class for defining evaluation scenario
13
13
  class Task(pydantic.BaseModel):
14
14
  id: str | None = None
15
15
  prompt: str
16
- setup: HudStyleConfigs | None = None
17
- evaluate: HudStyleConfigs | None = None
16
+ setup: FunctionConfigs | None = None
17
+ evaluate: FunctionConfigs | None = None
18
18
  gym: Gym | None = None
19
19
  target: str | list[str] | None = None # Inspect compatibility
20
20
  choices: list[str] | None = None # Inspect compatibility
@@ -33,8 +33,8 @@ See the [Tasks and TaskSets Concepts](/concepts/task) page for detailed explanat
33
33
 
34
34
  * **`id` (str | None):** Optional unique identifier, often assigned when loaded from the HUD platform.
35
35
  * **`prompt` (str):** The main instruction or goal for the agent.
36
- * **`setup` (`HudStyleConfigs` | None):** Configuration for setup actions executed before the agent starts. See [Configuration Styles](/concepts/task#configuration-styles).
37
- * **`evaluate` (`HudStyleConfigs` | None):** Configuration defining the evaluation logic executed by `env.evaluate()`. See [Configuration Styles](/concepts/task#configuration-styles).
36
+ * **`setup` (`FunctionConfigs` | None):** Configuration for setup actions executed before the agent starts. See [Configuration Styles](/concepts/task#configuration-styles).
37
+ * **`evaluate` (`FunctionConfigs` | None):** Configuration defining the evaluation logic executed by `env.evaluate()`. See [Configuration Styles](/concepts/task#configuration-styles).
38
38
  * **`gym` (`Gym` | None):** Specifies the required environment type (e.g., `"hud-browser"`, `CustomGym` object). See `hud.types`.
39
39
  * **`target` (str | list[str] | None):** Ideal target output (primarily for compatibility with `inspect-ai`).
40
40
  * **`choices` (list[str] | None):** Multiple choice options (primarily for compatibility with `inspect-ai`).
@@ -46,7 +46,7 @@ env_os = await gym.make("OSWorld-Ubuntu")
46
46
  # await env_os.close()
47
47
  ```
48
48
 
49
- Environments created this way won't have a default `Task` associated unless you explicitly reset them with one later using `env.reset()`. The `gym.make()` function also automatically links the environment to an active [Job](/concepts/job) if one was defined using the `@job` decorator.
49
+ Environments created this way won't have a default `Task` associated unless you explicitly reset them with one later using `env.reset()`. The `gym.make()` function also automatically links the environment to an active [Job](/concepts/job) if one was defined using the `@register_job` decorator.
50
50
 
51
51
  ## Available Environment Types
52
52
 
@@ -92,10 +92,10 @@ for _ in range(10):
92
92
  ## Key Methods
93
93
 
94
94
  * **`env.step(actions: list[CLA] | None = None)`**: Executes actions (or gets initial state). Returns `(Observation, reward, terminated, info)`.
95
- * **`env.evaluate(config: HudStyleConfigs | None = None)`**: Runs evaluation logic defined in the [Task](/concepts/task) (or the provided `config`). Returns evaluation result.
95
+ * **`env.evaluate(config: FunctionConfigs | None = None)`**: Runs evaluation logic defined in the [Task](/concepts/task) (or the provided `config`). Returns evaluation result.
96
96
  * **`env.close()`**: Shuts down the environment. Saves the [Trajectory](/concepts/trajectory) if linked to a [Job](/concepts/job).
97
97
  * **`env.get_urls()`**: Returns URLs (`url`, `live_url`) for accessing/viewing the environment.
98
- * **`env.reset(configs: HudStyleConfigs | None = None)`**: Resets state, often running setup steps. *Mostly used internally or for environments created without an initial Task.*
98
+ * **`env.reset(configs: FunctionConfigs | None = None)`**: Resets state, often running setup steps. *Mostly used internally or for environments created without an initial Task.*
99
99
  * **`env._setup(...)` / `env._invoke_all(...)`**: Internal methods for running setup/evaluate/custom configurations defined in a [Task](/concepts/task).
100
100
 
101
101
  ## Observations
@@ -110,5 +110,5 @@ The `Observation` object returned by `env.step()` contains:
110
110
  * [Task](/concepts/task): Defines the environment type (`gym`), `setup`, and `evaluate` logic.
111
111
  * [Agent](/concepts/agent): Interacts with the Environment via the `step` and `predict` methods.
112
112
  * [Adapter](/concepts/adapter): Ensures actions passed to `step` are in the correct `CLA` format.
113
- * [Job](/concepts/job): Groups environment runs; linking happens via `@job` or `gym.make(job=...)`.
113
+ * [Job](/concepts/job): Groups environment runs; linking happens via `@register_job` or `gym.make(job=...)`.
114
114
  * [Trajectory](/concepts/trajectory): The recording generated when a job-linked environment is closed.
@@ -18,16 +18,16 @@ Jobs help organize evaluation data, useful for:
18
18
 
19
19
  ## Creating Jobs
20
20
 
21
- ### 1. The `@job` Decorator (Recommended)
21
+ ### 1. The `@register_job` Decorator (Recommended)
22
22
 
23
23
  Decorate an `async` function. A new Job is created per function call, and any environments created within using `hud.gym.make()` are automatically linked.
24
24
 
25
25
  ```python
26
- from hud import gym, job
26
+ from hud import gym, register_job
27
27
  from hud.task import Task
28
28
  from hud.agent import OperatorAgent # Example agent
29
29
 
30
- @job(name="my-evaluation-run", metadata={"agent_version": "1.1"})
30
+ @register_job(name="my-evaluation-run", metadata={"agent_version": "1.1"})
31
31
  async def run_evaluation():
32
32
  task = Task(prompt="Example", gym="hud-browser")
33
33
  env = await gym.make(task) # Linked to "my-evaluation-run" job
@@ -89,7 +89,7 @@ async def analyze_job(job_id: str):
89
89
 
90
90
  ## Best Practices
91
91
 
92
- * Use `@job` for most scripts.
92
+ * Use `@register_job` for most scripts.
93
93
  * Use descriptive names and metadata.
94
94
  * Create separate jobs for distinct experiments.
95
95
 
@@ -15,8 +15,8 @@ A `Task` object provides the configuration for a specific scenario.
15
15
 
16
16
  * **`prompt` (str):** The primary instruction given to the agent.
17
17
  * **`gym` (str | `CustomGym` | None):** Specifies the type of [Environment](/concepts/environment) needed. Used by `hud.gym.make()`.
18
- * **`setup` (`HudStyleConfigs` | None):** Defines actions executed *before* the agent starts. See [Setup Configuration](#setup-configuration).
19
- * **`evaluate` (`HudStyleConfigs` | None):** Defines how to check if the agent succeeded *after* interaction. See [Evaluation Configuration](#evaluation-configuration).
18
+ * **`setup` (`FunctionConfigs` | None):** Defines actions executed *before* the agent starts. See [Setup Configuration](#setup-configuration).
19
+ * **`evaluate` (`FunctionConfigs` | None):** Defines how to check if the agent succeeded *after* interaction. See [Evaluation Configuration](#evaluation-configuration).
20
20
  * **`id` (str | None):** Optional identifier.
21
21
  * **`metadata` (dict | None):** Optional dictionary for extra information.
22
22
  * **`config` (dict | None):** Optional dictionary, primarily for remote execution.
@@ -41,7 +41,7 @@ task = Task(
41
41
 
42
42
  ### <a name="configuration-styles"></a>Configuration Styles (`setup` and `evaluate`)
43
43
 
44
- Both `setup` and `evaluate` accept configurations defining function calls within the environment's controller, using flexible formats (`HudStyleConfigs`):
44
+ Both `setup` and `evaluate` accept configurations defining function calls within the environment's controller, using flexible formats (`FunctionConfigs`):
45
45
 
46
46
  1. **String:** `"browser.maximize"`
47
47
  2. **Tuple:** `("goto", "https://google.com")`
@@ -43,13 +43,13 @@ This example uses the `OperatorAgent` to interact with a browser environment. It
43
43
  ```python
44
44
  import asyncio
45
45
  import os
46
- from hud import gym, job # Import gym for environments and job decorator
46
+ from hud import gym, register_job # Import gym for environments and job decorator
47
47
  from hud.task import Task # Import Task to define the goal
48
48
  from hud.agent import OperatorAgent # Import the agent
49
49
  # hud.settings automatically loads keys from .env or environment variables
50
50
 
51
51
  # Decorator to group this run under a job named "quickstart-run"
52
- @job("quickstart-run")
52
+ @register_job("quickstart-run")
53
53
  async def main():
54
54
  # 1. Define a Task: What should the agent do?
55
55
  task = Task(
@@ -111,7 +111,7 @@ if __name__ == "__main__":
111
111
  * `agent.predict(obs)` gets the next action(s) from the agent.
112
112
  * `env.step(actions)` executes the actions and gets the new observation.
113
113
  5. **Evaluation & Close:** `env.evaluate()` checks if the task succeeded based on the `evaluate` definition. `env.close()` shuts down the environment.
114
- 6. **`@job` Decorator:** Wrapping `main` with `@job("quickstart-run")` automatically creates a Job. When `env.close()` is called, the recorded interactions (trajectory) are associated with this Job. You can view the job and its trajectory video on the [HUD Jobs page](https://app.hud.so/jobs).
114
+ 6. **`@register_job` Decorator:** Wrapping `main` with `@register_job("quickstart-run")` automatically creates a Job. When `env.close()` is called, the recorded interactions (trajectory) are associated with this Job. You can view the job and its trajectory video on the [HUD Jobs page](https://app.hud.so/jobs).
115
115
 
116
116
  ## Next Steps
117
117
 
@@ -29,7 +29,7 @@ This is the most straightforward approach if your agent logic can directly gener
29
29
 
30
30
  ```python
31
31
  import asyncio
32
- from hud import gym, job
32
+ from hud import gym, register_job
33
33
  from hud.task import Task
34
34
  from hud.env import Observation
35
35
  # Import specific CLA types you need
@@ -48,7 +48,7 @@ def my_custom_agent_logic(observation: Observation) -> list[CLA]:
48
48
  # Ensure the return type is list[CLA]
49
49
  return actions
50
50
 
51
- @job("custom-cla-agent-run")
51
+ @register_job("custom-cla-agent-run")
52
52
  async def main():
53
53
  task = Task(prompt="Click and type", gym="hud-browser")
54
54
  env = await gym.make(task)
@@ -90,7 +90,7 @@ This approach leverages the SDK's structure for a more integrated solution.
90
90
  ```python
91
91
  import asyncio
92
92
  from typing import Any # Placeholder for your raw action type
93
- from hud import gym, job
93
+ from hud import gym, register_job
94
94
  from hud.task import Task
95
95
  from hud.env import Observation
96
96
  from hud.agent import Agent # Import base class
@@ -136,7 +136,7 @@ class MyAdapter(Adapter):
136
136
  raise ValueError(f"Unknown raw action type: {raw_action}")
137
137
 
138
138
  # --- Usage ---
139
- @job("custom-agent-framework-run")
139
+ @register_job("custom-agent-framework-run")
140
140
  async def main():
141
141
  task = Task(prompt="Use custom agent", gym="hud-browser")
142
142
  env = await gym.make(task)
@@ -181,13 +181,13 @@ This approach uses HUD primarily for environment provisioning and lifecycle mana
181
181
  ```python
182
182
  import asyncio
183
183
  import os
184
- from hud import gym, job
184
+ from hud import gym, register_job
185
185
  from hud.task import Task
186
186
  from hud.utils import stream # For live view
187
187
  # Need external library, e.g., pyppeteer (pip install pyppeteer)
188
188
  # import pyppeteer
189
189
 
190
- @job("external-control-run")
190
+ @register_job("external-control-run")
191
191
  async def main():
192
192
  task = Task(prompt="Externally controlled task", gym="hud-browser", setup=("goto", "google.com"))
193
193
  env = await gym.make(task)
@@ -1,4 +1,5 @@
1
1
  """Initialize the local-qa environment package."""
2
+
2
3
  from __future__ import annotations
3
4
 
4
5
  from .step import step