hud-python 0.2.1__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. {hud_python-0.2.1 → hud_python-0.2.3}/.github/workflows/ci.yml +3 -3
  2. {hud_python-0.2.1 → hud_python-0.2.3}/.gitignore +2 -0
  3. {hud_python-0.2.1 → hud_python-0.2.3}/PKG-INFO +44 -21
  4. {hud_python-0.2.1 → hud_python-0.2.3}/README.md +32 -18
  5. {hud_python-0.2.1 → hud_python-0.2.3}/docs/advanced/cla-details.mdx +5 -0
  6. {hud_python-0.2.1 → hud_python-0.2.3}/docs/advanced/environment-control.mdx +3 -3
  7. {hud_python-0.2.1 → hud_python-0.2.3}/docs/api-reference/env.mdx +6 -6
  8. {hud_python-0.2.1 → hud_python-0.2.3}/docs/api-reference/gym.mdx +2 -2
  9. {hud_python-0.2.1 → hud_python-0.2.3}/docs/api-reference/job.mdx +3 -3
  10. {hud_python-0.2.1 → hud_python-0.2.3}/docs/api-reference/task.mdx +4 -4
  11. {hud_python-0.2.1 → hud_python-0.2.3}/docs/concepts/environment.mdx +18 -4
  12. {hud_python-0.2.1 → hud_python-0.2.3}/docs/concepts/job.mdx +4 -4
  13. {hud_python-0.2.1 → hud_python-0.2.3}/docs/concepts/task.mdx +8 -7
  14. {hud_python-0.2.1 → hud_python-0.2.3}/docs/docs.json +8 -0
  15. hud_python-0.2.3/docs/environments/hud-browser.mdx +67 -0
  16. hud_python-0.2.3/docs/environments/hud-ubuntu.mdx +55 -0
  17. hud_python-0.2.3/docs/environments/qa.mdx +68 -0
  18. {hud_python-0.2.1 → hud_python-0.2.3}/docs/quickstart.mdx +4 -8
  19. {hud_python-0.2.1 → hud_python-0.2.3}/docs/running-your-agent.mdx +6 -6
  20. {hud_python-0.2.1 → hud_python-0.2.3}/environments/novnc_ubuntu/src/novnc_ubuntu/__init__.py +1 -0
  21. {hud_python-0.2.1 → hud_python-0.2.3}/environments/novnc_ubuntu/src/novnc_ubuntu/pyautogui_rosetta.py +76 -63
  22. {hud_python-0.2.1 → hud_python-0.2.3}/environments/novnc_ubuntu/src/novnc_ubuntu/step.py +0 -1
  23. hud_python-0.2.3/examples/WebVoyager_data.jsonl +643 -0
  24. {hud_python-0.2.1 → hud_python-0.2.3}/examples/browser_use.ipynb +5 -11
  25. hud_python-0.2.3/examples/ds_upload.ipynb +2316 -0
  26. hud_python-0.2.3/examples/example.ipynb +86 -0
  27. hud_python-0.2.3/examples/inspect.ipynb +2087 -0
  28. {hud_python-0.2.1 → hud_python-0.2.3}/examples/jobs.ipynb +4 -3
  29. {hud_python-0.2.1 → hud_python-0.2.3}/examples/local.ipynb +7 -18
  30. {hud_python-0.2.1 → hud_python-0.2.3}/examples/osworld.ipynb +32 -30
  31. {hud_python-0.2.1 → hud_python-0.2.3}/examples/tasks.ipynb +19 -20
  32. {hud_python-0.2.1 → hud_python-0.2.3}/hud/__init__.py +5 -3
  33. {hud_python-0.2.1 → hud_python-0.2.3}/hud/adapters/__init__.py +2 -1
  34. {hud_python-0.2.1 → hud_python-0.2.3}/hud/adapters/claude/adapter.py +13 -17
  35. {hud_python-0.2.1 → hud_python-0.2.3}/hud/adapters/common/adapter.py +3 -3
  36. hud_python-0.2.3/hud/adapters/common/tests/test_adapter.py +277 -0
  37. {hud_python-0.2.1 → hud_python-0.2.3}/hud/adapters/common/types.py +3 -6
  38. {hud_python-0.2.1 → hud_python-0.2.3}/hud/adapters/operator/adapter.py +22 -29
  39. hud_python-0.2.3/hud/agent/__init__.py +15 -0
  40. {hud_python-0.2.1 → hud_python-0.2.3}/hud/agent/base.py +28 -28
  41. {hud_python-0.2.1 → hud_python-0.2.3}/hud/agent/claude.py +69 -60
  42. hud_python-0.2.3/hud/agent/langchain.py +204 -0
  43. {hud_python-0.2.1 → hud_python-0.2.3}/hud/agent/operator.py +75 -67
  44. {hud_python-0.2.1 → hud_python-0.2.3}/hud/env/__init__.py +5 -5
  45. {hud_python-0.2.1 → hud_python-0.2.3}/hud/env/client.py +2 -2
  46. {hud_python-0.2.1 → hud_python-0.2.3}/hud/env/docker_client.py +37 -39
  47. {hud_python-0.2.1 → hud_python-0.2.3}/hud/env/environment.py +91 -66
  48. {hud_python-0.2.1 → hud_python-0.2.3}/hud/env/local_docker_client.py +5 -7
  49. {hud_python-0.2.1 → hud_python-0.2.3}/hud/env/remote_client.py +40 -29
  50. {hud_python-0.2.1 → hud_python-0.2.3}/hud/env/remote_docker_client.py +13 -3
  51. {hud_python-0.2.1 → hud_python-0.2.3}/hud/evaluators/__init__.py +2 -3
  52. {hud_python-0.2.1 → hud_python-0.2.3}/hud/evaluators/base.py +4 -3
  53. {hud_python-0.2.1 → hud_python-0.2.3}/hud/evaluators/inspect.py +3 -8
  54. {hud_python-0.2.1 → hud_python-0.2.3}/hud/evaluators/judge.py +34 -58
  55. {hud_python-0.2.1 → hud_python-0.2.3}/hud/evaluators/match.py +42 -49
  56. {hud_python-0.2.1 → hud_python-0.2.3}/hud/evaluators/remote.py +13 -26
  57. hud_python-0.2.3/hud/evaluators/tests/test_inspect.py +12 -0
  58. hud_python-0.2.3/hud/evaluators/tests/test_judge.py +231 -0
  59. hud_python-0.2.3/hud/evaluators/tests/test_match.py +115 -0
  60. hud_python-0.2.3/hud/evaluators/tests/test_remote.py +98 -0
  61. hud_python-0.2.3/hud/exceptions.py +167 -0
  62. {hud_python-0.2.1 → hud_python-0.2.3}/hud/gym.py +12 -10
  63. hud_python-0.2.3/hud/job.py +663 -0
  64. hud_python-0.2.3/hud/py.typed +0 -0
  65. hud_python-0.2.3/hud/server/__init__.py +5 -0
  66. hud_python-0.2.3/hud/server/requests.py +242 -0
  67. hud_python-0.2.3/hud/server/tests/__init__.py +0 -0
  68. hud_python-0.2.3/hud/server/tests/test_requests.py +275 -0
  69. {hud_python-0.2.1 → hud_python-0.2.3}/hud/settings.py +3 -2
  70. {hud_python-0.2.1 → hud_python-0.2.3}/hud/task.py +12 -22
  71. {hud_python-0.2.1 → hud_python-0.2.3}/hud/taskset.py +44 -11
  72. {hud_python-0.2.1 → hud_python-0.2.3}/hud/trajectory.py +6 -9
  73. {hud_python-0.2.1 → hud_python-0.2.3}/hud/types.py +14 -9
  74. {hud_python-0.2.1 → hud_python-0.2.3}/hud/utils/__init__.py +2 -2
  75. {hud_python-0.2.1 → hud_python-0.2.3}/hud/utils/common.py +37 -13
  76. {hud_python-0.2.1 → hud_python-0.2.3}/hud/utils/config.py +44 -29
  77. hud_python-0.2.3/hud/utils/progress.py +149 -0
  78. {hud_python-0.2.1 → hud_python-0.2.3}/hud/utils/telemetry.py +10 -11
  79. hud_python-0.2.3/hud/utils/tests/__init__.py +0 -0
  80. hud_python-0.2.3/hud/utils/tests/test_common.py +52 -0
  81. hud_python-0.2.3/hud/utils/tests/test_config.py +129 -0
  82. hud_python-0.2.3/hud/utils/tests/test_progress.py +225 -0
  83. hud_python-0.2.3/hud/utils/tests/test_telemetry.py +37 -0
  84. hud_python-0.2.1/tests/test_import.py → hud_python-0.2.3/hud/utils/tests/test_version.py +2 -1
  85. {hud_python-0.2.1 → hud_python-0.2.3}/pyproject.toml +33 -3
  86. hud_python-0.2.1/examples/inspect.ipynb +0 -169
  87. hud_python-0.2.1/hud/agent/__init__.py +0 -7
  88. hud_python-0.2.1/hud/job.py +0 -185
  89. hud_python-0.2.1/hud/server/__init__.py +0 -5
  90. hud_python-0.2.1/hud/server/requests.py +0 -280
  91. {hud_python-0.2.1 → hud_python-0.2.3}/.env.example +0 -0
  92. {hud_python-0.2.1 → hud_python-0.2.3}/.github/workflows/release.yml +0 -0
  93. {hud_python-0.2.1 → hud_python-0.2.3}/LICENSE +0 -0
  94. {hud_python-0.2.1 → hud_python-0.2.3}/MANIFEST.in +0 -0
  95. {hud_python-0.2.1 → hud_python-0.2.3}/docs/advanced/custom-environments.mdx +0 -0
  96. {hud_python-0.2.1 → hud_python-0.2.3}/docs/api/reference/adapters.mdx +0 -0
  97. {hud_python-0.2.1 → hud_python-0.2.3}/docs/api-reference/adapters.mdx +0 -0
  98. {hud_python-0.2.1 → hud_python-0.2.3}/docs/api-reference/taskset.mdx +0 -0
  99. {hud_python-0.2.1 → hud_python-0.2.3}/docs/api-reference/trajectory.mdx +0 -0
  100. {hud_python-0.2.1 → hud_python-0.2.3}/docs/concepts/adapter.mdx +0 -0
  101. {hud_python-0.2.1 → hud_python-0.2.3}/docs/concepts/agent.mdx +0 -0
  102. {hud_python-0.2.1 → hud_python-0.2.3}/docs/concepts/trajectory.mdx +0 -0
  103. {hud_python-0.2.1 → hud_python-0.2.3}/docs/examples/basic.mdx +0 -0
  104. {hud_python-0.2.1 → hud_python-0.2.3}/docs/examples/claude-agent.mdx +0 -0
  105. {hud_python-0.2.1 → hud_python-0.2.3}/docs/examples/custom-agent.mdx +0 -0
  106. {hud_python-0.2.1 → hud_python-0.2.3}/docs/favicon.png +0 -0
  107. {hud_python-0.2.1 → hud_python-0.2.3}/docs/installation.mdx +0 -0
  108. {hud_python-0.2.1 → hud_python-0.2.3}/docs/logo/HUD-light-optimized.svg +0 -0
  109. {hud_python-0.2.1 → hud_python-0.2.3}/docs/logo/HUD.svg +0 -0
  110. {hud_python-0.2.1 → hud_python-0.2.3}/environments/novnc_ubuntu/Dockerfile +0 -0
  111. {hud_python-0.2.1 → hud_python-0.2.3}/environments/novnc_ubuntu/pyproject.toml +0 -0
  112. {hud_python-0.2.1 → hud_python-0.2.3}/environments/qa_controller/Dockerfile +0 -0
  113. {hud_python-0.2.1 → hud_python-0.2.3}/environments/qa_controller/pyproject.toml +0 -0
  114. {hud_python-0.2.1 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/__init__.py +0 -0
  115. {hud_python-0.2.1 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/evaluate/__init__.py +0 -0
  116. {hud_python-0.2.1 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/evaluate/matchers.py +0 -0
  117. {hud_python-0.2.1 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/info.py +0 -0
  118. {hud_python-0.2.1 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/setup/__init__.py +0 -0
  119. {hud_python-0.2.1 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/setup/question.py +0 -0
  120. {hud_python-0.2.1 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/step.py +0 -0
  121. {hud_python-0.2.1 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/utils/__init__.py +0 -0
  122. {hud_python-0.2.1 → hud_python-0.2.3}/environments/qa_controller/src/qa_controller/utils/state.py +0 -0
  123. {hud_python-0.2.1 → hud_python-0.2.3}/examples/README.md +0 -0
  124. {hud_python-0.2.1 → hud_python-0.2.3}/hud/adapters/claude/__init__.py +0 -0
  125. {hud_python-0.2.1 → hud_python-0.2.3}/hud/adapters/common/__init__.py +0 -0
  126. {hud_python-0.2.1 → hud_python-0.2.3/hud/adapters/common}/tests/__init__.py +0 -0
  127. {hud_python-0.2.1 → hud_python-0.2.3}/hud/adapters/operator/__init__.py +0 -0
  128. /hud_python-0.2.1/hud/py.typed → /hud_python-0.2.3/hud/evaluators/tests/__init__.py +0 -0
@@ -4,7 +4,7 @@ on:
4
4
  push:
5
5
  branches: [ "main" ]
6
6
  pull_request:
7
- branches: [ "main" ]
7
+ branches: [ "*" ]
8
8
 
9
9
  jobs:
10
10
  test:
@@ -24,7 +24,7 @@ jobs:
24
24
  run: uv python install ${{ matrix.python-version }}
25
25
 
26
26
  - name: Run tests
27
- run: uv run --python ${{ matrix.python-version }} --with=".[dev]" pytest
27
+ run: uv run --python ${{ matrix.python-version }} --with=".[dev]" pytest --rootdir=hud --cov --cov-report=''
28
28
 
29
29
  lint-ruff:
30
30
  runs-on: ubuntu-latest
@@ -35,7 +35,7 @@ jobs:
35
35
 
36
36
  - name: Run ruff
37
37
  run: |
38
- uv run --with=".[dev]" ruff format .
38
+ uv run --with=".[dev]" ruff format . --check
39
39
  uv run --with=".[dev]" ruff check .
40
40
 
41
41
  lint-pyright:
@@ -25,3 +25,5 @@ uv.lock
25
25
  /*.ipynb
26
26
  test.json
27
27
  TODO.md
28
+
29
+ .coverage
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.2.1
3
+ Version: 0.2.3
4
4
  Summary: SDK for the HUD evaluation platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
7
7
  Project-URL: Documentation, https://hud.so
8
- Author-email: Human Union Data SDK <founders@hud.so>
8
+ Author-email: HUD SDK <founders@hud.so>
9
9
  License: MIT License
10
10
 
11
11
  Copyright (c) 2025 Human Union Data, Inc
@@ -37,8 +37,14 @@ Classifier: Programming Language :: Python :: 3.12
37
37
  Classifier: Programming Language :: Python :: 3.13
38
38
  Requires-Python: <3.14,>=3.10
39
39
  Requires-Dist: aiodocker>=0.24.0
40
+ Requires-Dist: anthropic
40
41
  Requires-Dist: httpx<1,>=0.23.0
41
42
  Requires-Dist: inspect-ai>=0.3.80
43
+ Requires-Dist: ipykernel
44
+ Requires-Dist: langchain
45
+ Requires-Dist: langchain-openai
46
+ Requires-Dist: numpy
47
+ Requires-Dist: openai
42
48
  Requires-Dist: pillow>=11.1.0
43
49
  Requires-Dist: pydantic-settings<3,>=2
44
50
  Requires-Dist: pydantic<3,>=2
@@ -53,8 +59,11 @@ Requires-Dist: jupyter-client; extra == 'dev'
53
59
  Requires-Dist: jupyter-core; extra == 'dev'
54
60
  Requires-Dist: openai; extra == 'dev'
55
61
  Requires-Dist: pyright==1.1.364; extra == 'dev'
62
+ Requires-Dist: pytest-asyncio; extra == 'dev'
63
+ Requires-Dist: pytest-cov; extra == 'dev'
64
+ Requires-Dist: pytest-mock; extra == 'dev'
56
65
  Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
57
- Requires-Dist: ruff==0.9.8; extra == 'dev'
66
+ Requires-Dist: ruff==0.11.8; extra == 'dev'
58
67
  Description-Content-Type: text/markdown
59
68
 
60
69
  # HUD
@@ -88,17 +97,17 @@ pip install hud-python
88
97
 
89
98
  ### Simple Browser Example with Claude Computer Use
90
99
 
91
- > This example uses the `@job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on the your [HUD Jobs page](https://app.hud.so/jobs).
100
+ > This example uses the `@register_job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
92
101
 
93
102
  Make sure you have defined your `ANTHROPIC_API_KEY` in environment variables to run Claude.
94
103
 
95
104
  ```python
96
105
  import asyncio
97
- from hud import gym, job
106
+ from hud import gym, register_job
98
107
  from hud.task import Task
99
108
  from hud.agent import ClaudeAgent
100
109
 
101
- @job("test-run")
110
+ @register_job("test-run")
102
111
  async def main():
103
112
  task = Task(
104
113
  prompt="Insert the text 'capybara' into the search bar",
@@ -117,10 +126,9 @@ async def main():
117
126
  obs, _ = await env.reset() # Gets first observation
118
127
  for i in range(5):
119
128
  actions, done = await agent.predict(obs)
120
- if done:
121
- break
122
-
129
+
123
130
  obs, reward, terminated, info = await env.step(actions)
131
+ if done or terminated: break
124
132
 
125
133
  # Evaluate and close
126
134
  result = await env.evaluate()
@@ -132,22 +140,37 @@ if __name__ == "__main__":
132
140
 
133
141
  ```
134
142
 
143
+ Alternatively, run a full evaluation set via the ```run_job``` command:
144
+
145
+ ```python
146
+ from hud import load_taskset, run_job, ClaudeAgent
147
+
148
+ # load
149
+ taskset = load_taskset("GAIA")
150
+
151
+ # evaluate
152
+ job = await run_job(ClaudeAgent, taskset, "test-gaia-job")
153
+
154
+ # get results OR view them in app.hud.so
155
+ print(await job.get_analytics())
156
+ ```
157
+
135
158
  ## Documentation Sections
136
159
 
137
160
  Explore the core concepts and features of the SDK:
138
161
 
139
- * **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
140
- * **[Environments](/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
141
- * **[Agents](/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
142
- * **[Adapters](/concepts/adapter)**: See how actions and observations are translated between agents and environments.
143
- * **[Jobs](/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
144
- * **[Trajectories](/concepts/trajectory)**: Understand the recorded data from each agent run.
162
+ * **[Tasks and TaskSets](https://documentation.hud.so/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
163
+ * **[Environments](https://documentation.hud.so/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
164
+ * **[Agents](https://documentation.hud.so/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
165
+ * **[Adapters](https://documentation.hud.so/concepts/adapter)**: See how actions and observations are translated between agents and environments.
166
+ * **[Jobs](https://documentation.hud.so/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
167
+ * **[Trajectories](https://documentation.hud.so/concepts/trajectory)**: Understand the recorded data from each agent run.
145
168
  * **Advanced Topics**:
146
- * **[CLA Action Details](/advanced/cla-details)**: Explore the standardized action format.
147
- * **[Custom Environments](/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
148
- * **[Advanced Environment Control](/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
169
+ * **[CLA Action Details](https://documentation.hud.so/advanced/cla-details)**: Explore the standardized action format.
170
+ * **[Custom Environments](https://documentation.hud.so/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
171
+ * **[Advanced Environment Control](https://documentation.hud.so/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
149
172
 
150
- * **[Full API Reference](/api-reference/gym)**: Detailed specifications for all modules and classes.
173
+ * **[Full API Reference](https://documentation.hud.so/api-reference/gym)**: Detailed specifications for all modules and classes.
151
174
 
152
175
  ## [Examples](examples/)
153
176
 
@@ -160,7 +183,7 @@ We recommend you first take a look at the example notebooks showing how to use t
160
183
 
161
184
  ## Documentation
162
185
 
163
- For comprehensive guides, examples, and API reference, visit [our docs](https://docs.hud.so/introduction)
186
+ For comprehensive guides, examples, and API reference, visit [our docs](https://documentation.hud.so/introduction)
164
187
 
165
188
  ## License
166
189
 
@@ -172,7 +195,7 @@ If you use this SDK in your research, please cite it as follows:
172
195
 
173
196
  ```bibtex
174
197
  @software{hud2025agentevalplatform,
175
- author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Max Muoto and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
198
+ author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
176
199
  title = {{HUD: An Evaluation Platform for Agents}},
177
200
  date = {2025-04},
178
201
  url = {https://github.com/hud-evals/hud-sdk},
@@ -29,17 +29,17 @@ pip install hud-python
29
29
 
30
30
  ### Simple Browser Example with Claude Computer Use
31
31
 
32
- > This example uses the `@job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on the your [HUD Jobs page](https://app.hud.so/jobs).
32
+ > This example uses the `@register_job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
33
33
 
34
34
  Make sure you have defined your `ANTHROPIC_API_KEY` in environment variables to run Claude.
35
35
 
36
36
  ```python
37
37
  import asyncio
38
- from hud import gym, job
38
+ from hud import gym, register_job
39
39
  from hud.task import Task
40
40
  from hud.agent import ClaudeAgent
41
41
 
42
- @job("test-run")
42
+ @register_job("test-run")
43
43
  async def main():
44
44
  task = Task(
45
45
  prompt="Insert the text 'capybara' into the search bar",
@@ -58,10 +58,9 @@ async def main():
58
58
  obs, _ = await env.reset() # Gets first observation
59
59
  for i in range(5):
60
60
  actions, done = await agent.predict(obs)
61
- if done:
62
- break
63
-
61
+
64
62
  obs, reward, terminated, info = await env.step(actions)
63
+ if done or terminated: break
65
64
 
66
65
  # Evaluate and close
67
66
  result = await env.evaluate()
@@ -73,22 +72,37 @@ if __name__ == "__main__":
73
72
 
74
73
  ```
75
74
 
75
+ Alternatively, run a full evaluation set via the ```run_job``` command:
76
+
77
+ ```python
78
+ from hud import load_taskset, run_job, ClaudeAgent
79
+
80
+ # load
81
+ taskset = load_taskset("GAIA")
82
+
83
+ # evaluate
84
+ job = await run_job(ClaudeAgent, taskset, "test-gaia-job")
85
+
86
+ # get results OR view them in app.hud.so
87
+ print(await job.get_analytics())
88
+ ```
89
+
76
90
  ## Documentation Sections
77
91
 
78
92
  Explore the core concepts and features of the SDK:
79
93
 
80
- * **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
81
- * **[Environments](/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
82
- * **[Agents](/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
83
- * **[Adapters](/concepts/adapter)**: See how actions and observations are translated between agents and environments.
84
- * **[Jobs](/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
85
- * **[Trajectories](/concepts/trajectory)**: Understand the recorded data from each agent run.
94
+ * **[Tasks and TaskSets](https://documentation.hud.so/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
95
+ * **[Environments](https://documentation.hud.so/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
96
+ * **[Agents](https://documentation.hud.so/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
97
+ * **[Adapters](https://documentation.hud.so/concepts/adapter)**: See how actions and observations are translated between agents and environments.
98
+ * **[Jobs](https://documentation.hud.so/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
99
+ * **[Trajectories](https://documentation.hud.so/concepts/trajectory)**: Understand the recorded data from each agent run.
86
100
  * **Advanced Topics**:
87
- * **[CLA Action Details](/advanced/cla-details)**: Explore the standardized action format.
88
- * **[Custom Environments](/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
89
- * **[Advanced Environment Control](/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
101
+ * **[CLA Action Details](https://documentation.hud.so/advanced/cla-details)**: Explore the standardized action format.
102
+ * **[Custom Environments](https://documentation.hud.so/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
103
+ * **[Advanced Environment Control](https://documentation.hud.so/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
90
104
 
91
- * **[Full API Reference](/api-reference/gym)**: Detailed specifications for all modules and classes.
105
+ * **[Full API Reference](https://documentation.hud.so/api-reference/gym)**: Detailed specifications for all modules and classes.
92
106
 
93
107
  ## [Examples](examples/)
94
108
 
@@ -101,7 +115,7 @@ We recommend you first take a look at the example notebooks showing how to use t
101
115
 
102
116
  ## Documentation
103
117
 
104
- For comprehensive guides, examples, and API reference, visit [our docs](https://docs.hud.so/introduction)
118
+ For comprehensive guides, examples, and API reference, visit [our docs](https://documentation.hud.so/introduction)
105
119
 
106
120
  ## License
107
121
 
@@ -113,7 +127,7 @@ If you use this SDK in your research, please cite it as follows:
113
127
 
114
128
  ```bibtex
115
129
  @software{hud2025agentevalplatform,
116
- author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Max Muoto and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
130
+ author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
117
131
  title = {{HUD: An Evaluation Platform for Agents}},
118
132
  date = {2025-04},
119
133
  url = {https://github.com/hud-evals/hud-sdk},
@@ -69,6 +69,11 @@ Here are some key CLA types grouped by category:
69
69
  * **`ScreenshotFetch`**: Requests a screenshot (used internally, typically not sent by agents directly).
70
70
  * **`PositionFetch`**: Requests the current cursor position (used internally).
71
71
 
72
+ ### Response Actions
73
+
74
+ * **`ResponseAction`**: Used to submit a final text answer.
75
+ * `text: str`: The final textual response from the agent.
76
+
72
77
  ### Custom Actions
73
78
 
74
79
  * **`CustomAction`**: Allows defining arbitrary actions specific to a custom environment controller.
@@ -12,11 +12,11 @@ While the standard `step`, `evaluate`, and `close` methods cover most interactio
12
12
  The `env._invoke_all()` method (and its underlying `client.invoke()`) is the core mechanism for calling specific functions *within* the environment's controller script.
13
13
 
14
14
  ```python
15
- async def _invoke_all(self, configs: HudStyleConfigs) -> list[Any]: ...
15
+ async def _invoke_all(self, configs: FunctionConfigs) -> list[Any]: ...
16
16
  ```
17
17
 
18
18
  * **Purpose:** Execute custom functions defined in your environment controller (the Python code running inside the Docker container or remote instance). This is how `setup` and `evaluate` configurations in a `Task` are ultimately executed.
19
- * **Usage:** You provide a configuration (string, tuple, dict, or list) matching the `HudStyleConfigs` format. The SDK sends this to the environment controller, which runs the specified function(s) with the given arguments.
19
+ * **Usage:** You provide a configuration (string, tuple, dict, or list) matching the `FunctionConfigs` format. The SDK sends this to the environment controller, which runs the specified function(s) with the given arguments.
20
20
  * **When to Use:**
21
21
  * Triggering custom evaluation logic not suitable for the standard `evaluate` attribute.
22
22
  * Running specific diagnostic or state-setting functions within your custom environment controller during development or debugging.
@@ -71,7 +71,7 @@ print("Exit Code:", result['exit_code'])
71
71
  ## `_setup`
72
72
 
73
73
  ```python
74
- async def _setup(self, config: HudStyleConfigs | None = None) -> None: ...
74
+ async def _setup(self, config: FunctionConfigs | None = None) -> None: ...
75
75
  ```
76
76
 
77
77
  * **Purpose:** Executes the setup configuration for the environment.
@@ -25,11 +25,11 @@ class Environment(pydantic.BaseModel):
25
25
  ) -> tuple[Observation, float, bool, dict[str, Any]]: ...
26
26
 
27
27
  async def evaluate(
28
- self, config: HudStyleConfigs | None = None
28
+ self, config: FunctionConfigs | None = None
29
29
  ) -> Any: ...
30
30
 
31
31
  async def reset(
32
- self, configs: HudStyleConfigs | None = None
32
+ self, configs: FunctionConfigs | None = None
33
33
  ) -> tuple[Observation, dict[str, Any]]: ...
34
34
 
35
35
  async def get_urls(self) -> dict[str, Any]: ...
@@ -37,8 +37,8 @@ class Environment(pydantic.BaseModel):
37
37
  async def close(self) -> None: ...
38
38
 
39
39
  # Internal/Advanced Methods
40
- # async def _setup(self, config: HudStyleConfigs | None = None) -> None: ...
41
- # async def _invoke_all(self, configs: HudStyleConfigs) -> list[Any]: ...
40
+ # async def _setup(self, config: FunctionConfigs | None = None) -> None: ...
41
+ # async def _invoke_all(self, configs: FunctionConfigs) -> list[Any]: ...
42
42
  ```
43
43
 
44
44
  Represents a running instance (browser, OS) where an [Agent](/concepts/agent) interacts. Environments are typically created using `hud.gym.make()` rather than direct construction.
@@ -58,11 +58,11 @@ Represents a running instance (browser, OS) where an [Agent](/concepts/agent) in
58
58
  * **Parameters:**
59
59
  * `actions`: List of [CLA](/concepts/adapter) actions, or `None` to get initial observation.
60
60
  * **Returns:** `(Observation, reward, terminated, info)` tuple. `reward` is typically 0 unless overridden by custom logic. `terminated` is typically `False`.
61
- * **`evaluate(self, config: HudStyleConfigs | None = None)`:** Runs the evaluation logic defined in the [Task](/concepts/task) (or the provided `config`).
61
+ * **`evaluate(self, config: FunctionConfigs | None = None)`:** Runs the evaluation logic defined in the [Task](/concepts/task) (or the provided `config`).
62
62
  * **Parameters:**
63
63
  * `config`: Optional override for evaluation logic using [Configuration Styles](/concepts/task#configuration-styles).
64
64
  * **Returns:** The result from the evaluation function(s).
65
- * **`reset(self, configs: HudStyleConfigs | None = None)`:** Resets the environment state, usually running setup logic.
65
+ * **`reset(self, configs: FunctionConfigs | None = None)`:** Resets the environment state, usually running setup logic.
66
66
  * **Parameters:**
67
67
  * `configs`: Optional override for setup logic.
68
68
  * **Returns:** `(Observation, info)` tuple after resetting. *(Note: `gym.make(task)` handles initial setup; direct `reset` is less common).*
@@ -20,7 +20,7 @@ async def make(
20
20
 
21
21
  Creates and initializes an [Environment](/concepts/environment) instance based on a specification.
22
22
 
23
- This function handles selecting the correct client (local docker, remote docker, remote direct) based on the `env_src`, automatically linking to an active [Job](/concepts/job) (from `@job` decorator or the `job` parameter), and running the initial [Task](/concepts/task) setup if `env_src` is a `Task`.
23
+ This function handles selecting the correct client (local docker, remote docker, remote direct) based on the `env_src`, automatically linking to an active [Job](/concepts/job) (from `@register_job` decorator or the `job` parameter), and running the initial [Task](/concepts/task) setup if `env_src` is a `Task`.
24
24
 
25
25
  **Parameters:**
26
26
 
@@ -28,7 +28,7 @@ This function handles selecting the correct client (local docker, remote docker,
28
28
  * If a `str` (Gym ID like `"hud-browser"`, `"OSWorld-Ubuntu"`), creates a standard remote environment.
29
29
  * If a `CustomGym` object, creates a custom environment based on its definition (local or remote docker).
30
30
  * If a `Task` object, uses the `task.gym` attribute to determine the environment type and automatically runs `task.setup` after creation.
31
- * **`job` (`Job` | None, optional):** A specific [Job](/concepts/job) object to associate this environment run with. If `None`, it attempts to find an active job created by the `@job` decorator.
31
+ * **`job` (`Job` | None, optional):** A specific [Job](/concepts/job) object to associate this environment run with. If `None`, it attempts to find an active job created by the `@register_job` decorator.
32
32
  * **`metadata` (dict[str, Any] | None, optional):** Additional metadata to attach to the environment instance and its resulting trajectory.
33
33
 
34
34
  **Returns:**
@@ -3,13 +3,13 @@ title: 'hud.job'
3
3
  description: 'API reference for Jobs and related functions/decorators'
4
4
  ---
5
5
 
6
- The `hud.job` module provides the `@job` decorator, functions to manage Jobs (`create_job`, `load_job`), and the `Job` class itself.
6
+ The `hud.job` module provides the `@register_job` decorator, functions to manage Jobs (`create_job`, `load_job`), and the `Job` class itself.
7
7
 
8
8
  See the [Job Concepts](/concepts/job) page for explanations and usage examples.
9
9
 
10
10
  # Decorators
11
11
 
12
- ## @job
12
+ ## @register_job
13
13
 
14
14
  ```python
15
15
  def job(
@@ -92,7 +92,7 @@ class Job(pydantic.BaseModel):
92
92
  ) -> list[Trajectory]: ...
93
93
  ```
94
94
 
95
- Represents a Job, typically obtained via `@job`, `create_job`, or `load_job`. Primarily used to access associated trajectories.
95
+ Represents a Job, typically obtained via `@register_job`, `create_job`, or `load_job`. Primarily used to access associated trajectories.
96
96
 
97
97
  **Attributes:**
98
98
 
@@ -13,8 +13,8 @@ The `hud.task` module provides the `Task` class for defining evaluation scenario
13
13
  class Task(pydantic.BaseModel):
14
14
  id: str | None = None
15
15
  prompt: str
16
- setup: HudStyleConfigs | None = None
17
- evaluate: HudStyleConfigs | None = None
16
+ setup: FunctionConfigs | None = None
17
+ evaluate: FunctionConfigs | None = None
18
18
  gym: Gym | None = None
19
19
  target: str | list[str] | None = None # Inspect compatibility
20
20
  choices: list[str] | None = None # Inspect compatibility
@@ -33,8 +33,8 @@ See the [Tasks and TaskSets Concepts](/concepts/task) page for detailed explanat
33
33
 
34
34
  * **`id` (str | None):** Optional unique identifier, often assigned when loaded from the HUD platform.
35
35
  * **`prompt` (str):** The main instruction or goal for the agent.
36
- * **`setup` (`HudStyleConfigs` | None):** Configuration for setup actions executed before the agent starts. See [Configuration Styles](/concepts/task#configuration-styles).
37
- * **`evaluate` (`HudStyleConfigs` | None):** Configuration defining the evaluation logic executed by `env.evaluate()`. See [Configuration Styles](/concepts/task#configuration-styles).
36
+ * **`setup` (`FunctionConfigs` | None):** Configuration for setup actions executed before the agent starts. See [Configuration Styles](/concepts/task#configuration-styles).
37
+ * **`evaluate` (`FunctionConfigs` | None):** Configuration defining the evaluation logic executed by `env.evaluate()`. See [Configuration Styles](/concepts/task#configuration-styles).
38
38
  * **`gym` (`Gym` | None):** Specifies the required environment type (e.g., `"hud-browser"`, `CustomGym` object). See `hud.types`.
39
39
  * **`target` (str | list[str] | None):** Ideal target output (primarily for compatibility with `inspect-ai`).
40
40
  * **`choices` (list[str] | None):** Multiple choice options (primarily for compatibility with `inspect-ai`).
@@ -46,7 +46,21 @@ env_os = await gym.make("OSWorld-Ubuntu")
46
46
  # await env_os.close()
47
47
  ```
48
48
 
49
- Environments created this way won't have a default `Task` associated unless you explicitly reset them with one later using `env.reset()`. The `gym.make()` function also automatically links the environment to an active [Job](/concepts/job) if one was defined using the `@job` decorator.
49
+ Environments created this way won't have a default `Task` associated unless you explicitly reset them with one later using `env.reset()`. The `gym.make()` function also automatically links the environment to an active [Job](/concepts/job) if one was defined using the `@register_job` decorator.
50
+
51
+ ## Available Environment Types
52
+
53
+ The HUD SDK provides several standard environment types, specified via the `gym` attribute in a [Task](/concepts/task) or directly in `hud.gym.make()`:
54
+
55
+ * **`"hud-browser"`**: Provides a remote Chromium browser instance managed via Playwright. Ideal for web navigation, form interaction, and testing web applications.
56
+ * [See `hud-browser` Details](/environments/hud-browser)
57
+ * **`"hud-ubuntu"`**: Provides a remote Ubuntu desktop environment accessed via VNC. Suitable for tasks involving GUI applications, file system interaction, or running Linux software.
58
+ * [See `hud-ubuntu` Details](/environments/hud-ubuntu)
59
+ * **`"qa"`**: A non-interactive environment for question-answering tasks where the agent provides a direct textual response.
60
+ * [See `qa` Environment Details](/environments/qa)
61
+ * **`CustomGym`**: Allows defining and running your own [Custom Environments](/advanced/custom-environments) using Docker, either locally or remotely. This provides maximum flexibility for specific testing needs.
62
+
63
+ The `gym` attribute in a Task tells `hud.gym.make()` which environment to instantiate.
50
64
 
51
65
  ## Interaction Loop
52
66
 
@@ -78,10 +92,10 @@ for _ in range(10):
78
92
  ## Key Methods
79
93
 
80
94
  * **`env.step(actions: list[CLA] | None = None)`**: Executes actions (or gets initial state). Returns `(Observation, reward, terminated, info)`.
81
- * **`env.evaluate(config: HudStyleConfigs | None = None)`**: Runs evaluation logic defined in the [Task](/concepts/task) (or the provided `config`). Returns evaluation result.
95
+ * **`env.evaluate(config: FunctionConfigs | None = None)`**: Runs evaluation logic defined in the [Task](/concepts/task) (or the provided `config`). Returns evaluation result.
82
96
  * **`env.close()`**: Shuts down the environment. Saves the [Trajectory](/concepts/trajectory) if linked to a [Job](/concepts/job).
83
97
  * **`env.get_urls()`**: Returns URLs (`url`, `live_url`) for accessing/viewing the environment.
84
- * **`env.reset(configs: HudStyleConfigs | None = None)`**: Resets state, often running setup steps. *Mostly used internally or for environments created without an initial Task.*
98
+ * **`env.reset(configs: FunctionConfigs | None = None)`**: Resets state, often running setup steps. *Mostly used internally or for environments created without an initial Task.*
85
99
  * **`env._setup(...)` / `env._invoke_all(...)`**: Internal methods for running setup/evaluate/custom configurations defined in a [Task](/concepts/task).
86
100
 
87
101
  ## Observations
@@ -96,5 +110,5 @@ The `Observation` object returned by `env.step()` contains:
96
110
  * [Task](/concepts/task): Defines the environment type (`gym`), `setup`, and `evaluate` logic.
97
111
  * [Agent](/concepts/agent): Interacts with the Environment via the `step` and `predict` methods.
98
112
  * [Adapter](/concepts/adapter): Ensures actions passed to `step` are in the correct `CLA` format.
99
- * [Job](/concepts/job): Groups environment runs; linking happens via `@job` or `gym.make(job=...)`.
113
+ * [Job](/concepts/job): Groups environment runs; linking happens via `@register_job` or `gym.make(job=...)`.
100
114
  * [Trajectory](/concepts/trajectory): The recording generated when a job-linked environment is closed.
@@ -18,16 +18,16 @@ Jobs help organize evaluation data, useful for:
18
18
 
19
19
  ## Creating Jobs
20
20
 
21
- ### 1. The `@job` Decorator (Recommended)
21
+ ### 1. The `@register_job` Decorator (Recommended)
22
22
 
23
23
  Decorate an `async` function. A new Job is created per function call, and any environments created inside that function with `hud.gym.make()` are automatically linked to it.
24
24
 
25
25
  ```python
26
- from hud import gym, job
26
+ from hud import gym, register_job
27
27
  from hud.task import Task
28
28
  from hud.agent import OperatorAgent # Example agent
29
29
 
30
- @job(name="my-evaluation-run", metadata={"agent_version": "1.1"})
30
+ @register_job(name="my-evaluation-run", metadata={"agent_version": "1.1"})
31
31
  async def run_evaluation():
32
32
  task = Task(prompt="Example", gym="hud-browser")
33
33
  env = await gym.make(task) # Linked to "my-evaluation-run" job
@@ -89,7 +89,7 @@ async def analyze_job(job_id: str):
89
89
 
90
90
  ## Best Practices
91
91
 
92
- * Use `@job` for most scripts.
92
+ * Use `@register_job` for most scripts.
93
93
  * Use descriptive names and metadata.
94
94
  * Create separate jobs for distinct experiments.
95
95
 
@@ -15,8 +15,8 @@ A `Task` object provides the configuration for a specific scenario.
15
15
 
16
16
  * **`prompt` (str):** The primary instruction given to the agent.
17
17
  * **`gym` (str | `CustomGym` | None):** Specifies the type of [Environment](/concepts/environment) needed. Used by `hud.gym.make()`.
18
- * **`setup` (`HudStyleConfigs` | None):** Defines actions executed *before* the agent starts. See [Setup Configuration](#setup-configuration).
19
- * **`evaluate` (`HudStyleConfigs` | None):** Defines how to check if the agent succeeded *after* interaction. See [Evaluation Configuration](#evaluation-configuration).
18
+ * **`setup` (`FunctionConfigs` | None):** Defines actions executed *before* the agent starts. See [Setup Configuration](#setup-configuration).
19
+ * **`evaluate` (`FunctionConfigs` | None):** Defines how to check if the agent succeeded *after* interaction. See [Evaluation Configuration](#evaluation-configuration).
20
20
  * **`id` (str | None):** Optional identifier.
21
21
  * **`metadata` (dict | None):** Optional dictionary for extra information.
22
22
  * **`config` (dict | None):** Optional dictionary, primarily for remote execution.
@@ -30,11 +30,10 @@ task = Task(
30
30
  prompt="Log in to example.com with username 'test'",
31
31
  gym="hud-browser", # Request a browser environment
32
32
  setup=[ # Actions run by gym.make(task)
33
- ("goto", "https://example.com/login"),
34
- {"function": "wait_for_element", "args": ["#username"]}
33
+ ("goto", "https://example.com/login")
35
34
  ],
36
35
  evaluate={ # Logic run by env.evaluate()
37
- "function": "check_login_status",
36
+ "function": "page_contains",
38
37
  "args": ["test"]
39
38
  }
40
39
  )
@@ -42,7 +41,7 @@ task = Task(
42
41
 
43
42
  ### <a name="configuration-styles"></a>Configuration Styles (`setup` and `evaluate`)
44
43
 
45
- Both `setup` and `evaluate` accept configurations defining function calls within the environment's controller, using flexible formats (`HudStyleConfigs`):
44
+ Both `setup` and `evaluate` accept configurations defining function calls within the environment's controller, using flexible formats (`FunctionConfigs`):
46
45
 
47
46
  1. **String:** `"browser.maximize"`
48
47
  2. **Tuple:** `("goto", "https://google.com")`
@@ -82,11 +81,13 @@ Load predefined sets from the HUD platform:
82
81
  ```python
83
82
  from hud import load_taskset
84
83
 
85
- taskset = await load_taskset("OSWorld-Ubuntu-Links")
84
+ taskset = await load_taskset("OSWorld-Ubuntu")
86
85
  print(f"Number of tasks: {len(taskset)}") # TaskSet acts like a list
87
86
  first_task = taskset[0]
88
87
  ```
89
88
 
89
+ Currently supported TaskSets available via `load_taskset` include OSWorld, GAIA, and WebVoyager subsets.
90
+
90
91
  ### Creating a TaskSet Manually
91
92
 
92
93
  ```python
@@ -29,6 +29,14 @@
29
29
  "concepts/trajectory"
30
30
  ]
31
31
  },
32
+ {
33
+ "group": "Environments",
34
+ "pages": [
35
+ "environments/hud-browser",
36
+ "environments/hud-ubuntu",
37
+ "environments/qa"
38
+ ]
39
+ },
32
40
  {
33
41
  "group": "Advanced Topics",
34
42
  "pages": [
@@ -0,0 +1,67 @@
1
+ # HUD Browser Environment
2
+
3
+ ## Introduction
4
+
5
+ The `hud-browser` environment provides a remote Chromium browser instance, managed by Playwright, for agents to interact with websites. It's ideal for tasks involving web navigation, form filling, information retrieval, and testing web applications.
6
+
7
+ ## Setup
8
+
9
+ Setup actions for the `hud-browser` are defined in the `setup` attribute of a [Task](../concepts/task.mdx) and executed by `hud.gym.make()`. They typically involve browser controller functions.
10
+
11
+ * **`goto(url: str)`**: Navigates the browser to the specified `url`. Automatically prepends `http://` if no scheme is provided. Waits for `domcontentloaded` (up to 10s timeout) and adds a 1s wait for rendering.
12
+ ```python
13
+ # Example Task Setup:
14
+ setup=[("goto", "https://google.com")]
15
+ ```
16
+ * **Other common setup functions (coming soon):** `wait_for_element`, `click`, `type`, `set_cookies`, etc. These are not yet available.
17
+
18
+ Refer to [Task Setup Configuration](../concepts/task.mdx#setup-configuration) for how to define these.
19
+
20
+ ## Step Interaction
21
+
22
+ Agents interact with the browser environment by sending a list of [CLA Actions](../advanced/cla-details.mdx) to `env.step()`. An [Adapter](../concepts/adapter.mdx) typically handles the conversion from the agent model's output to the CLA format.
23
+
24
+ Common CLAs used with `hud-browser`:
25
+ * [`ClickAction`](../advanced/cla-details.mdx#mouse-actions)
26
+ * [`MoveAction`](../advanced/cla-details.mdx#mouse-actions)
27
+ * [`TypeAction`](../advanced/cla-details.mdx#keyboard-actions)
28
+ * [`PressAction`](../advanced/cla-details.mdx#keyboard-actions)
29
+ * [`ScrollAction`](../advanced/cla-details.mdx#mouse-actions)
30
+ * [`DragAction`](../advanced/cla-details.mdx#mouse-actions)
31
+ * [`ResponseAction`](../advanced/cla-details.mdx#response-actions) (to submit a final text answer)
32
+
33
+ *See [CLA Action Details](../advanced/cla-details.mdx) for the full specification.*
34
+
35
+ ## Evaluate
36
+
37
+ The `evaluate` attribute of a [Task](../concepts/task.mdx) defines how success is measured using `env.evaluate()`. This calls functions within the browser controller.
38
+
39
+ Built-in evaluation functions for `hud-browser`:
40
+
41
+ * **`url_match(expected_url: str)`**: Checks if the current browser URL exactly matches `expected_url`. Returns `1.0` for a match, `0.0` otherwise.
42
+ ```python
43
+ # Example Task Evaluation:
44
+ evaluate=("url_match", "https://google.com/search?q=expected")
45
+ ```
46
+ * **`page_contains(texts: list[str])`** (alias `contains_text`): Checks if *all* strings in `texts` are present in `page.content()`. Returns `1.0` if all texts are found, `0.0` otherwise.
47
+ ```python
48
+ # Example Task Evaluation:
49
+ evaluate=("page_contains", ["Search Results", "About 1,000,000 results"])
50
+ ```
51
+ * **`sheet_contains(texts: list[str])`**: Custom function for Google Sheets. Returns `1.0` if any text is found, `0.0` otherwise.
52
+ ```python
53
+ # Example Task Evaluation:
54
+ evaluate=("sheet_contains", ["Expected value in cell A1"])
55
+ ```
56
+ * **`cookie_exists(cookie_names: list[str])`**: Checks if all cookies in `cookie_names` exist in `context.cookies()`. Returns `1.0` if all exist, `0.0` otherwise.
57
+ ```python
58
+ # Example Task Evaluation:
59
+ evaluate=("cookie_exists", ["session_id", "user_pref"])
60
+ ```
61
+ * **`cookie_match(name_value_pairs: list[str])`**: Checks if cookies exist *and* match expected values. `name_value_pairs` format: `[name1, value1, name2, value2, ...]`. Returns `1.0` if all match, `0.0` otherwise.
62
+ ```python
63
+ # Example Task Evaluation:
64
+ evaluate=("cookie_match", ["user_id", "12345", "theme", "dark"])
65
+ ```
66
+
67
+ Refer to [Task Evaluation Configuration](../concepts/task.mdx#evaluation-configuration) for more details.