hud-python 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.2.0 → hud_python-0.2.1}/PKG-INFO +19 -26
- {hud_python-0.2.0 → hud_python-0.2.1}/README.md +16 -23
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/concepts/environment.mdx +3 -2
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/concepts/task.mdx +32 -2
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/docs.json +5 -4
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/quickstart.mdx +3 -9
- hud_python-0.2.1/docs/running-your-agent.mdx +237 -0
- hud_python-0.2.1/examples/browser_use.ipynb +119 -0
- hud_python-0.2.1/examples/inspect.ipynb +169 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/examples/jobs.ipynb +4 -6
- {hud_python-0.2.0 → hud_python-0.2.1}/examples/local.ipynb +7 -3
- {hud_python-0.2.0 → hud_python-0.2.1}/examples/osworld.ipynb +3 -3
- hud_python-0.2.1/examples/tasks.ipynb +257 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/__init__.py +1 -1
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/claude/adapter.py +9 -1
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/common/types.py +7 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/operator/adapter.py +4 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/agent/claude.py +22 -2
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/agent/operator.py +35 -17
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/env/docker_client.py +1 -1
- hud_python-0.2.1/hud/env/environment.py +354 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/env/local_docker_client.py +3 -1
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/task.py +41 -30
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/taskset.py +8 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/utils/common.py +28 -1
- hud_python-0.2.1/hud/utils/config.py +94 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/pyproject.toml +3 -3
- {hud_python-0.2.0 → hud_python-0.2.1}/tests/test_import.py +1 -1
- hud_python-0.2.0/examples/browser_use.ipynb +0 -324
- hud_python-0.2.0/examples/tasks.ipynb +0 -117
- hud_python-0.2.0/hud/env/environment.py +0 -181
- hud_python-0.2.0/hud/utils/config.py +0 -185
- {hud_python-0.2.0 → hud_python-0.2.1}/.env.example +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/.github/workflows/ci.yml +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/.github/workflows/release.yml +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/.gitignore +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/LICENSE +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/MANIFEST.in +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/advanced/cla-details.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/advanced/custom-environments.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/advanced/environment-control.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/api/reference/adapters.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/api-reference/adapters.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/api-reference/env.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/api-reference/gym.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/api-reference/job.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/api-reference/task.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/api-reference/taskset.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/api-reference/trajectory.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/concepts/adapter.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/concepts/agent.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/concepts/job.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/concepts/trajectory.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/examples/basic.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/examples/claude-agent.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/examples/custom-agent.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/favicon.png +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/installation.mdx +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/logo/HUD-light-optimized.svg +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/docs/logo/HUD.svg +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/novnc_ubuntu/Dockerfile +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/novnc_ubuntu/pyproject.toml +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/novnc_ubuntu/src/novnc_ubuntu/__init__.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/novnc_ubuntu/src/novnc_ubuntu/pyautogui_rosetta.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/novnc_ubuntu/src/novnc_ubuntu/step.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/Dockerfile +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/pyproject.toml +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/__init__.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/evaluate/__init__.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/evaluate/matchers.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/info.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/setup/__init__.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/setup/question.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/step.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/utils/__init__.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/utils/state.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/examples/README.md +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/__init__.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/claude/__init__.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/common/__init__.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/common/adapter.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/operator/__init__.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/agent/__init__.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/agent/base.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/env/__init__.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/env/client.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/env/remote_client.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/env/remote_docker_client.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/evaluators/__init__.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/evaluators/base.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/evaluators/inspect.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/evaluators/judge.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/evaluators/match.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/evaluators/remote.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/gym.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/job.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/py.typed +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/server/__init__.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/server/requests.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/settings.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/trajectory.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/types.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/utils/__init__.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/hud/utils/telemetry.py +0 -0
- {hud_python-0.2.0 → hud_python-0.2.1}/tests/__init__.py +0 -0
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.1
|
|
4
4
|
Summary: SDK for the HUD evaluation platform.
|
|
5
|
-
Project-URL: Homepage, https://github.com/
|
|
6
|
-
Project-URL: Bug Tracker, https://github.com/
|
|
5
|
+
Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
|
|
7
7
|
Project-URL: Documentation, https://hud.so
|
|
8
8
|
Author-email: Human Union Data SDK <founders@hud.so>
|
|
9
9
|
License: MIT License
|
|
@@ -57,7 +57,7 @@ Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
|
|
|
57
57
|
Requires-Dist: ruff==0.9.8; extra == 'dev'
|
|
58
58
|
Description-Content-Type: text/markdown
|
|
59
59
|
|
|
60
|
-
# HUD
|
|
60
|
+
# HUD
|
|
61
61
|
|
|
62
62
|
A Python SDK for creating, evaluating, and benchmarking agent interactions with web browsers and OS environments.
|
|
63
63
|
|
|
@@ -86,21 +86,20 @@ export HUD_API_KEY=your_api_key_here
|
|
|
86
86
|
pip install hud-python
|
|
87
87
|
```
|
|
88
88
|
|
|
89
|
-
### Simple Browser Example with
|
|
89
|
+
### Simple Browser Example with Claude Computer Use
|
|
90
90
|
|
|
91
91
|
> This example uses the `@job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
|
|
92
92
|
|
|
93
|
+
Make sure you have defined your `ANTHROPIC_API_KEY` in environment variables to run Claude.
|
|
94
|
+
|
|
93
95
|
```python
|
|
94
|
-
import os
|
|
95
96
|
import asyncio
|
|
96
97
|
from hud import gym, job
|
|
97
98
|
from hud.task import Task
|
|
98
|
-
from hud.
|
|
99
|
-
from hud.agent import OperatorAgent
|
|
99
|
+
from hud.agent import ClaudeAgent
|
|
100
100
|
|
|
101
101
|
@job("test-run")
|
|
102
102
|
async def main():
|
|
103
|
-
# Define a simple task
|
|
104
103
|
task = Task(
|
|
105
104
|
prompt="Insert the text 'capybara' into the search bar",
|
|
106
105
|
gym="hud-browser",
|
|
@@ -108,26 +107,20 @@ async def main():
|
|
|
108
107
|
evaluate=("contains_text", "capybara")
|
|
109
108
|
)
|
|
110
109
|
|
|
111
|
-
# Create environment
|
|
110
|
+
# Create environment using the gym module
|
|
112
111
|
env = await gym.make(task)
|
|
113
112
|
|
|
114
|
-
# Get URLs and display live view (optional)
|
|
115
|
-
# urls = await env.get_urls()
|
|
116
|
-
# stream(urls["live_url"])
|
|
117
|
-
|
|
118
113
|
# Initialize Claude agent (API key is loaded automatically)
|
|
119
|
-
agent =
|
|
114
|
+
agent = ClaudeAgent()
|
|
120
115
|
|
|
121
|
-
# Agent loop
|
|
122
|
-
obs, _ = env.reset()
|
|
116
|
+
# Agent loop with predict and step functions
|
|
117
|
+
obs, _ = await env.reset() # Gets first observation
|
|
123
118
|
for i in range(5):
|
|
124
119
|
actions, done = await agent.predict(obs)
|
|
125
120
|
if done:
|
|
126
121
|
break
|
|
127
122
|
|
|
128
123
|
obs, reward, terminated, info = await env.step(actions)
|
|
129
|
-
if terminated:
|
|
130
|
-
break
|
|
131
124
|
|
|
132
125
|
# Evaluate and close
|
|
133
126
|
result = await env.evaluate()
|
|
@@ -143,26 +136,26 @@ if __name__ == "__main__":
|
|
|
143
136
|
|
|
144
137
|
Explore the core concepts and features of the SDK:
|
|
145
138
|
|
|
146
|
-
* **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios.
|
|
139
|
+
* **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
|
|
147
140
|
* **[Environments](/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
|
|
148
141
|
* **[Agents](/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
|
|
149
142
|
* **[Adapters](/concepts/adapter)**: See how actions and observations are translated between agents and environments.
|
|
150
143
|
* **[Jobs](/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
|
|
151
144
|
* **[Trajectories](/concepts/trajectory)**: Understand the recorded data from each agent run.
|
|
152
145
|
* **Advanced Topics**:
|
|
146
|
+
* **[CLA Action Details](/advanced/cla-details)**: Explore the standardized action format.
|
|
153
147
|
* **[Custom Environments](/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
|
|
154
148
|
* **[Advanced Environment Control](/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
|
|
155
|
-
* **[CLA Action Details](/advanced/cla-details)**: Dive deeper into the standardized action format.
|
|
156
149
|
|
|
157
150
|
* **[Full API Reference](/api-reference/gym)**: Detailed specifications for all modules and classes.
|
|
158
151
|
|
|
159
152
|
## [Examples](examples/)
|
|
160
153
|
|
|
161
|
-
We
|
|
154
|
+
We recommend you first take a look at the example notebooks showing how to use the HUD SDK:
|
|
162
155
|
|
|
163
156
|
1. [Browser Basics](examples/browser_use.ipynb) - Simple browser interaction with live view
|
|
164
157
|
2. [Task Design](examples/tasks.ipynb) - Creating and customizing tasks
|
|
165
|
-
3. [OSWorld](examples/osworld.ipynb) -
|
|
158
|
+
3. [OSWorld](examples/osworld.ipynb) - Running the OSWorld benchmark
|
|
166
159
|
4. [Local Development](examples/local.ipynb) - Setting up local custom environments
|
|
167
160
|
|
|
168
161
|
## Documentation
|
|
@@ -180,9 +173,9 @@ If you use this SDK in your research, please cite it as follows:
|
|
|
180
173
|
```bibtex
|
|
181
174
|
@software{hud2025agentevalplatform,
|
|
182
175
|
author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Max Muoto and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
|
|
183
|
-
title = {{HUD: An Evaluation Platform for
|
|
184
|
-
date = {2025-
|
|
185
|
-
url = {https://github.com/
|
|
176
|
+
title = {{HUD: An Evaluation Platform for Agents}},
|
|
177
|
+
date = {2025-04},
|
|
178
|
+
url = {https://github.com/hud-evals/hud-sdk},
|
|
186
179
|
langid = {en}
|
|
187
180
|
}
|
|
188
181
|
```
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# HUD
|
|
1
|
+
# HUD
|
|
2
2
|
|
|
3
3
|
A Python SDK for creating, evaluating, and benchmarking agent interactions with web browsers and OS environments.
|
|
4
4
|
|
|
@@ -27,21 +27,20 @@ export HUD_API_KEY=your_api_key_here
|
|
|
27
27
|
pip install hud-python
|
|
28
28
|
```
|
|
29
29
|
|
|
30
|
-
### Simple Browser Example with
|
|
30
|
+
### Simple Browser Example with Claude Computer Use
|
|
31
31
|
|
|
32
32
|
> This example uses the `@job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
|
|
33
33
|
|
|
34
|
+
Make sure you have defined your `ANTHROPIC_API_KEY` in environment variables to run Claude.
|
|
35
|
+
|
|
34
36
|
```python
|
|
35
|
-
import os
|
|
36
37
|
import asyncio
|
|
37
38
|
from hud import gym, job
|
|
38
39
|
from hud.task import Task
|
|
39
|
-
from hud.
|
|
40
|
-
from hud.agent import OperatorAgent
|
|
40
|
+
from hud.agent import ClaudeAgent
|
|
41
41
|
|
|
42
42
|
@job("test-run")
|
|
43
43
|
async def main():
|
|
44
|
-
# Define a simple task
|
|
45
44
|
task = Task(
|
|
46
45
|
prompt="Insert the text 'capybara' into the search bar",
|
|
47
46
|
gym="hud-browser",
|
|
@@ -49,26 +48,20 @@ async def main():
|
|
|
49
48
|
evaluate=("contains_text", "capybara")
|
|
50
49
|
)
|
|
51
50
|
|
|
52
|
-
# Create environment
|
|
51
|
+
# Create environment using the gym module
|
|
53
52
|
env = await gym.make(task)
|
|
54
53
|
|
|
55
|
-
# Get URLs and display live view (optional)
|
|
56
|
-
# urls = await env.get_urls()
|
|
57
|
-
# stream(urls["live_url"])
|
|
58
|
-
|
|
59
54
|
# Initialize Claude agent (API key is loaded automatically)
|
|
60
|
-
agent =
|
|
55
|
+
agent = ClaudeAgent()
|
|
61
56
|
|
|
62
|
-
# Agent loop
|
|
63
|
-
obs, _ = env.reset()
|
|
57
|
+
# Agent loop with predict and step functions
|
|
58
|
+
obs, _ = await env.reset() # Gets first observation
|
|
64
59
|
for i in range(5):
|
|
65
60
|
actions, done = await agent.predict(obs)
|
|
66
61
|
if done:
|
|
67
62
|
break
|
|
68
63
|
|
|
69
64
|
obs, reward, terminated, info = await env.step(actions)
|
|
70
|
-
if terminated:
|
|
71
|
-
break
|
|
72
65
|
|
|
73
66
|
# Evaluate and close
|
|
74
67
|
result = await env.evaluate()
|
|
@@ -84,26 +77,26 @@ if __name__ == "__main__":
|
|
|
84
77
|
|
|
85
78
|
Explore the core concepts and features of the SDK:
|
|
86
79
|
|
|
87
|
-
* **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios.
|
|
80
|
+
* **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
|
|
88
81
|
* **[Environments](/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
|
|
89
82
|
* **[Agents](/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
|
|
90
83
|
* **[Adapters](/concepts/adapter)**: See how actions and observations are translated between agents and environments.
|
|
91
84
|
* **[Jobs](/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
|
|
92
85
|
* **[Trajectories](/concepts/trajectory)**: Understand the recorded data from each agent run.
|
|
93
86
|
* **Advanced Topics**:
|
|
87
|
+
* **[CLA Action Details](/advanced/cla-details)**: Explore the standardized action format.
|
|
94
88
|
* **[Custom Environments](/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
|
|
95
89
|
* **[Advanced Environment Control](/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
|
|
96
|
-
* **[CLA Action Details](/advanced/cla-details)**: Dive deeper into the standardized action format.
|
|
97
90
|
|
|
98
91
|
* **[Full API Reference](/api-reference/gym)**: Detailed specifications for all modules and classes.
|
|
99
92
|
|
|
100
93
|
## [Examples](examples/)
|
|
101
94
|
|
|
102
|
-
We
|
|
95
|
+
We recommend you first take a look at the example notebooks showing how to use the HUD SDK:
|
|
103
96
|
|
|
104
97
|
1. [Browser Basics](examples/browser_use.ipynb) - Simple browser interaction with live view
|
|
105
98
|
2. [Task Design](examples/tasks.ipynb) - Creating and customizing tasks
|
|
106
|
-
3. [OSWorld](examples/osworld.ipynb) -
|
|
99
|
+
3. [OSWorld](examples/osworld.ipynb) - Running the OSWorld benchmark
|
|
107
100
|
4. [Local Development](examples/local.ipynb) - Setting up local custom environments
|
|
108
101
|
|
|
109
102
|
## Documentation
|
|
@@ -121,9 +114,9 @@ If you use this SDK in your research, please cite it as follows:
|
|
|
121
114
|
```bibtex
|
|
122
115
|
@software{hud2025agentevalplatform,
|
|
123
116
|
author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Max Muoto and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
|
|
124
|
-
title = {{HUD: An Evaluation Platform for
|
|
125
|
-
date = {2025-
|
|
126
|
-
url = {https://github.com/
|
|
117
|
+
title = {{HUD: An Evaluation Platform for Agents}},
|
|
118
|
+
date = {2025-04},
|
|
119
|
+
url = {https://github.com/hud-evals/hud-sdk},
|
|
127
120
|
langid = {en}
|
|
128
121
|
}
|
|
129
122
|
```
|
|
@@ -67,13 +67,14 @@ obs, _ = await env.reset()
|
|
|
67
67
|
for _ in range(10):
|
|
68
68
|
# 2. Agent predicts action(s)
|
|
69
69
|
actions, done = await agent.predict(obs)
|
|
70
|
-
if done: break
|
|
71
70
|
|
|
72
71
|
# 3. Execute action(s) in environment
|
|
73
72
|
obs, reward, terminated, info = await env.step(actions)
|
|
74
|
-
if terminated: break
|
|
73
|
+
if done or terminated: break
|
|
75
74
|
```
|
|
76
75
|
|
|
76
|
+
* **Note on QA Tasks:** For [Question-Answering Tasks](/concepts/task#defining-question-answering-qa-tasks), the agent might only need one `predict` call. The agent should output a `ResponseAction`, which the environment stores. The subsequent `env.evaluate()` call then checks this stored response. The environment itself remains largely passive for QA.
|
|
77
|
+
|
|
77
78
|
## Key Methods
|
|
78
79
|
|
|
79
80
|
* **`env.step(actions: list[CLA] | None = None)`**: Executes actions (or gets initial state). Returns `(Observation, reward, terminated, info)`.
|
|
@@ -60,7 +60,10 @@ Both `setup` and `evaluate` accept configurations defining function calls within
|
|
|
60
60
|
* **Purpose:** Determines task success after the agent finishes.
|
|
61
61
|
* **Execution:** Triggered by `await env.evaluate()`.
|
|
62
62
|
* **Result:** The return value of `env.evaluate()`, often a reward score (e.g., `1.0` or `0.0`). This is stored in the `reward` field of the [Trajectory](/concepts/trajectory) if linked to a [Job](/concepts/job).
|
|
63
|
-
* **Examples:**
|
|
63
|
+
* **Examples:**
|
|
64
|
+
* Interactive: `("contains_text", "Success!")`, `("file_exists", "/path/to/output.txt")`. These typically call functions *within* the active environment controller.
|
|
65
|
+
* QA: `("response_includes", "Paris")`. These functions often check the text stored in `env.final_response` (which comes from the agent's `ResponseAction`).
|
|
66
|
+
* **Note:** Check specific environment or evaluation service documentation for available functions.
|
|
64
67
|
|
|
65
68
|
## TaskSet
|
|
66
69
|
|
|
@@ -99,4 +102,31 @@ my_taskset = TaskSet(tasks=[task1, task2], description="My set")
|
|
|
99
102
|
* [Environment](/concepts/environment): Where Tasks are executed and evaluated.
|
|
100
103
|
* [Agent](/concepts/agent): Aims to complete the Task `prompt`.
|
|
101
104
|
* [Job](/concepts/job): Groups runs of different Tasks.
|
|
102
|
-
* [Trajectory](/concepts/trajectory): Records the execution of a Task.
|
|
105
|
+
* [Trajectory](/concepts/trajectory): Records the execution of a Task.
|
|
106
|
+
|
|
107
|
+
### Defining Question-Answering (QA) Tasks
|
|
108
|
+
|
|
109
|
+
While HUD excels at interactive tasks, you can also define tasks that are primarily question-answering. The key differences are:
|
|
110
|
+
|
|
111
|
+
* **`gym`:** You might still use an existing environment type like `"hud-browser"` if you want the QA to happen *within* that context (e.g., asking the agent to answer based on a webpage). For pure QA without environment interaction, a future specific `"qa"` gym type might be introduced, but currently, you'd use an existing type.
|
|
112
|
+
* **`prompt`:** Contains the question for the agent.
|
|
113
|
+
* **`setup`:** Often minimal or unnecessary for pure QA.
|
|
114
|
+
* **`evaluate`:** Defines how to check the agent's final text answer. This typically involves calling a specific evaluation function that compares the agent's final submitted response (see `ResponseAction` in [CLA Details](/advanced/cla-details)) against expected criteria. The `env.final_response` attribute holds the text submitted by the agent via `ResponseAction`.
|
|
115
|
+
* **`target`:** (Recommended) Store the ground truth answer in the `metadata` or potentially a dedicated `target` field for clarity during evaluation function design.
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from hud.task import Task
|
|
119
|
+
|
|
120
|
+
qa_task = Task(
|
|
121
|
+
prompt="What is the powerhouse of the cell?",
|
|
122
|
+
gym="hud-browser", # Or potentially a future "qa" type
|
|
123
|
+
# No complex setup needed for pure QA
|
|
124
|
+
setup=(),
|
|
125
|
+
# Evaluation checks the agent's final submitted text response
|
|
126
|
+
evaluate=("response_includes", "mitochondria"), # Assumes a function checking env.final_response
|
|
127
|
+
)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
The [Agent](/concepts/agent) handling such a task should recognize it doesn't need complex interaction and output a `ResponseAction` containing the final answer. The `env.evaluate()` call then triggers the specified check (like `response_includes`) against the stored response.
|
|
131
|
+
|
|
132
|
+
### <a name="configuration-styles"></a>Configuration Styles (`setup` and `evaluate`)
|
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
"group": "Getting Started",
|
|
15
15
|
"pages": [
|
|
16
16
|
"quickstart",
|
|
17
|
+
"running-your-agent",
|
|
17
18
|
"installation"
|
|
18
19
|
]
|
|
19
20
|
},
|
|
@@ -31,9 +32,9 @@
|
|
|
31
32
|
{
|
|
32
33
|
"group": "Advanced Topics",
|
|
33
34
|
"pages": [
|
|
35
|
+
"advanced/cla-details",
|
|
34
36
|
"advanced/custom-environments",
|
|
35
|
-
"advanced/environment-control"
|
|
36
|
-
"advanced/cla-details"
|
|
37
|
+
"advanced/environment-control"
|
|
37
38
|
]
|
|
38
39
|
},
|
|
39
40
|
{
|
|
@@ -59,13 +60,13 @@
|
|
|
59
60
|
"links": [
|
|
60
61
|
{
|
|
61
62
|
"label": "GitHub",
|
|
62
|
-
"href": "https://github.com/
|
|
63
|
+
"href": "https://github.com/hud-evals/hud-sdk"
|
|
63
64
|
}
|
|
64
65
|
]
|
|
65
66
|
},
|
|
66
67
|
"footer": {
|
|
67
68
|
"socials": {
|
|
68
|
-
"github": "https://github.com/
|
|
69
|
+
"github": "https://github.com/hud-evals/hud-sdk",
|
|
69
70
|
"website": "https://hud.so"
|
|
70
71
|
}
|
|
71
72
|
}
|
|
@@ -31,7 +31,7 @@ The SDK automatically loads API keys from environment variables or a `.env` file
|
|
|
31
31
|
|
|
32
32
|
Example `.env` file:
|
|
33
33
|
```
|
|
34
|
-
HUD_API_KEY=
|
|
34
|
+
HUD_API_KEY=sk-hud-...
|
|
35
35
|
OPENAI_API_KEY=sk-...
|
|
36
36
|
# ANTHROPIC_API_KEY=sk-ant-...
|
|
37
37
|
```
|
|
@@ -79,16 +79,10 @@ async def main():
|
|
|
79
79
|
actions, done = await agent.predict(obs)
|
|
80
80
|
print(f"Agent action(s): {actions}")
|
|
81
81
|
|
|
82
|
-
if done:
|
|
83
|
-
print("Agent signaled task completion.")
|
|
84
|
-
break
|
|
85
|
-
|
|
86
82
|
# Execute the action(s) in the environment
|
|
87
83
|
obs, reward, terminated, info = await env.step(actions)
|
|
88
84
|
|
|
89
|
-
if terminated:
|
|
90
|
-
print("Environment terminated.")
|
|
91
|
-
break
|
|
85
|
+
if done or terminated: break # Agent signaled task completion or environment terminated
|
|
92
86
|
|
|
93
87
|
# 5. Evaluate & Close
|
|
94
88
|
print("Evaluating task...")
|
|
@@ -127,4 +121,4 @@ if __name__ == "__main__":
|
|
|
127
121
|
|
|
128
122
|
* Explore the [Core Concepts](/concepts/environment) to understand the SDK architecture in more detail.
|
|
129
123
|
* Check out the [Examples folder in the GitHub repo](/examples/) for more detailed, runnable notebooks covering different agents and environments.
|
|
130
|
-
* Review the [API Reference](/api-reference/gym) for comprehensive documentation on specific functions and classes.
|
|
124
|
+
* Review the [API Reference](/api-reference/gym) for comprehensive documentation on specific functions and classes.
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: 'Running Your Own Agent'
|
|
3
|
+
description: 'Integrating custom agent logic with HUD environments'
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Running Your Own Agent
|
|
7
|
+
|
|
8
|
+
The HUD SDK is designed to be flexible, allowing you to integrate various types of AI agents. While the SDK provides built-in agents (`ClaudeAgent`, `OperatorAgent`), you can easily run your own custom agent logic. This guide outlines the primary ways to achieve this.
|
|
9
|
+
|
|
10
|
+
The core interaction loop with any HUD [Environment](/concepts/environment) involves:
|
|
11
|
+
1. Creating the environment: `env = await hud.gym.make(...)`
|
|
12
|
+
2. Getting an initial observation: `obs, _ = await env.reset()`
|
|
13
|
+
3. **Agent Decision:** Processing `obs` to decide on the next action(s).
|
|
14
|
+
4. Executing actions: `obs, _, _, _ = await env.step(actions)`
|
|
15
|
+
5. Evaluating the outcome: `result = await env.evaluate()`
|
|
16
|
+
6. Closing the environment: `await env.close()`
|
|
17
|
+
|
|
18
|
+
The key difference lies in how **Step 3 (Agent Decision)** is implemented and how the resulting `actions` are formatted for **Step 4**.
|
|
19
|
+
|
|
20
|
+
## Approach 1: Direct CLA Interaction
|
|
21
|
+
|
|
22
|
+
This is the most straightforward approach if your agent logic can directly generate actions conforming to HUD's **Common Language Actions (CLA)** format. See [CLA Action Details](/advanced/cla-details) for format specifics.
|
|
23
|
+
|
|
24
|
+
* **Concept:** Your agent code, running outside the HUD `Agent` class structure, processes the `Observation` and directly constructs a list of `CLA` objects.
|
|
25
|
+
* **Implementation:**
|
|
26
|
+
* Focus on your agent's decision-making process based on `obs.screenshot` and `obs.text`.
|
|
27
|
+
* Your agent's output must be `list[CLA]`. You'll need to import specific `CLA` types (like `ClickAction`, `TypeAction`, etc.) from `hud.adapters.common.types`.
|
|
28
|
+
* Pass this list directly to `env.step()`.
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import asyncio
|
|
32
|
+
from hud import gym, job
|
|
33
|
+
from hud.task import Task
|
|
34
|
+
from hud.env import Observation
|
|
35
|
+
# Import specific CLA types you need
|
|
36
|
+
from hud.adapters import CLA
|
|
37
|
+
from hud.adapters.common.types import ClickAction, TypeAction, Point
|
|
38
|
+
|
|
39
|
+
# --- Your Custom Agent Logic ---
|
|
40
|
+
def my_custom_agent_logic(observation: Observation) -> list[CLA]:
|
|
41
|
+
# Process screenshot/text...
|
|
42
|
+
# Decide on next actions...
|
|
43
|
+
# Example: Click at (100, 150) and type "hello"
|
|
44
|
+
actions = [
|
|
45
|
+
ClickAction(point=Point(x=100, y=150)),
|
|
46
|
+
TypeAction(text="hello")
|
|
47
|
+
]
|
|
48
|
+
# Ensure the return type is list[CLA]
|
|
49
|
+
return actions
|
|
50
|
+
|
|
51
|
+
@job("custom-cla-agent-run")
|
|
52
|
+
async def main():
|
|
53
|
+
task = Task(prompt="Click and type", gym="hud-browser")
|
|
54
|
+
env = await gym.make(task)
|
|
55
|
+
obs, _ = await env.reset() # Initial observation
|
|
56
|
+
|
|
57
|
+
for i in range(5):
|
|
58
|
+
print(f"--- Step {i+1} ---")
|
|
59
|
+
# Get actions directly from your logic
|
|
60
|
+
agent_actions: list[CLA] = my_custom_agent_logic(obs)
|
|
61
|
+
print(f"Agent actions: {agent_actions}")
|
|
62
|
+
|
|
63
|
+
# Step the environment with CLA actions
|
|
64
|
+
obs, _, terminated, info = await env.step(agent_actions)
|
|
65
|
+
|
|
66
|
+
if terminated: break # Check termination
|
|
67
|
+
|
|
68
|
+
result = await env.evaluate()
|
|
69
|
+
print(f"Evaluation: {result}")
|
|
70
|
+
await env.close()
|
|
71
|
+
|
|
72
|
+
# if __name__ == "__main__":
|
|
73
|
+
# asyncio.run(main())
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
* **Pros:** Simple integration, doesn't require understanding the `Agent`/`Adapter` inheritance structure.
|
|
77
|
+
* **Cons:** Your agent logic needs to be aware of and construct specific `CLA` Pydantic models. No automatic observation preprocessing (like screenshot rescaling) or action postprocessing (like coordinate rescaling) provided by the `Adapter` framework.
|
|
78
|
+
|
|
79
|
+
## Approach 2: Inheriting `hud.agent.Agent`
|
|
80
|
+
|
|
81
|
+
This approach leverages the SDK's structure for a more integrated solution.
|
|
82
|
+
|
|
83
|
+
* **Concept:** Create a class that inherits from `hud.agent.Agent`. Implement the `fetch_response` method to contain your core agent logic (calling your model, processing results). Optionally, create a custom `hud.adapters.Adapter` if your model uses a non-standard action format or requires specific observation rescaling.
|
|
84
|
+
* **Implementation:**
|
|
85
|
+
* Define `MyAgent(Agent[MyClientType, MyRawActionType])`.
|
|
86
|
+
* Implement `async def fetch_response(self, observation: Observation) -> tuple[list[MyRawActionType], bool]: ...`. This method should return the *raw* actions from your model and a `done` flag.
|
|
87
|
+
* (Optional) Define `MyAdapter(Adapter)` and implement `convert(self, raw_action: MyRawActionType) -> CLA: ...`. You might also override `__init__` to set `self.agent_width`/`height` if different from the default.
|
|
88
|
+
* Instantiate your agent, optionally passing your custom adapter: `agent = MyAgent(client=my_llm_client, adapter=MyAdapter())`. If you provide an adapter, the base `Agent.predict` method will automatically call `adapter.rescale` before `fetch_response` and `adapter.adapt_list` after.
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
import asyncio
|
|
92
|
+
from typing import Any # Placeholder for your raw action type
|
|
93
|
+
from hud import gym, job
|
|
94
|
+
from hud.task import Task
|
|
95
|
+
from hud.env import Observation
|
|
96
|
+
from hud.agent import Agent # Import base class
|
|
97
|
+
from hud.adapters import Adapter, CLA # Import base adapter and CLA type
|
|
98
|
+
# Import your specific CLA types if needed for a custom adapter
|
|
99
|
+
from hud.adapters.common.types import ClickAction, TypeAction, Point
|
|
100
|
+
|
|
101
|
+
# --- Your Custom Agent ---
|
|
102
|
+
class MyRawAction(dict): # Example raw action type (e.g., a dictionary)
|
|
103
|
+
pass
|
|
104
|
+
|
|
105
|
+
class MyAgent(Agent[Any, MyRawAction]): # Specify client type and raw action type
|
|
106
|
+
# You might initialize your LLM client here
|
|
107
|
+
def __init__(self, adapter: Adapter | None = None): # Optionally take an adapter
|
|
108
|
+
super().__init__(client=None, adapter=adapter) # Pass adapter to base
|
|
109
|
+
|
|
110
|
+
async def fetch_response(self, observation: Observation) -> tuple[list[MyRawAction], bool]:
|
|
111
|
+
# 1. Process observation (screenshot already rescaled if adapter exists)
|
|
112
|
+
prompt = f"Image received. Task: {observation.text}. What to do?"
|
|
113
|
+
# 2. Call your custom LLM / logic
|
|
114
|
+
# llm_response = await my_llm_call(prompt, observation.screenshot)
|
|
115
|
+
llm_response = {"action_type": "click", "x": 200, "y": 250} # Dummy response
|
|
116
|
+
|
|
117
|
+
# 3. Convert LLM response to your raw action format
|
|
118
|
+
raw_actions: list[MyRawAction] = [MyRawAction(llm_response)] # Example
|
|
119
|
+
done = False # Decide if task is done
|
|
120
|
+
return raw_actions, done
|
|
121
|
+
|
|
122
|
+
# --- (Optional) Your Custom Adapter ---
|
|
123
|
+
class MyAdapter(Adapter):
|
|
124
|
+
def __init__(self):
|
|
125
|
+
super().__init__()
|
|
126
|
+
self.agent_width = 1000 # Example: If your model expects 1000px wide images
|
|
127
|
+
self.agent_height = 800
|
|
128
|
+
|
|
129
|
+
def convert(self, raw_action: MyRawAction) -> CLA:
|
|
130
|
+
# Convert your raw action dict to a CLA Pydantic model
|
|
131
|
+
if raw_action.get("action_type") == "click":
|
|
132
|
+
return ClickAction(point=Point(x=raw_action["x"], y=raw_action["y"]))
|
|
133
|
+
elif raw_action.get("action_type") == "type":
|
|
134
|
+
return TypeAction(text=raw_action.get("text", ""))
|
|
135
|
+
# ... handle other action types ...
|
|
136
|
+
raise ValueError(f"Unknown raw action type: {raw_action}")
|
|
137
|
+
|
|
138
|
+
# --- Usage ---
|
|
139
|
+
@job("custom-agent-framework-run")
|
|
140
|
+
async def main():
|
|
141
|
+
task = Task(prompt="Use custom agent", gym="hud-browser")
|
|
142
|
+
env = await gym.make(task)
|
|
143
|
+
|
|
144
|
+
# Initialize agent, optionally with the adapter
|
|
145
|
+
my_agent = MyAgent(adapter=MyAdapter()) # Adapter handles conversion + rescaling
|
|
146
|
+
|
|
147
|
+
obs, _ = await env.reset()
|
|
148
|
+
for i in range(5):
|
|
149
|
+
print(f"--- Step {i+1} ---")
|
|
150
|
+
# Predict handles preprocess, fetch_response, postprocess
|
|
151
|
+
processed_actions, done = await my_agent.predict(obs)
|
|
152
|
+
print(f"Processed CLA actions: {processed_actions}")
|
|
153
|
+
|
|
154
|
+
if done: break
|
|
155
|
+
obs, _, terminated, info = await env.step(processed_actions)
|
|
156
|
+
if terminated: break
|
|
157
|
+
|
|
158
|
+
result = await env.evaluate()
|
|
159
|
+
print(f"Evaluation: {result}")
|
|
160
|
+
await env.close()
|
|
161
|
+
|
|
162
|
+
# if __name__ == "__main__":
|
|
163
|
+
# asyncio.run(main())
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
* **Pros:** Leverages SDK structure, benefits from automatic rescaling (if adapter used), cleaner separation of agent logic (`fetch_response`) and action conversion (`Adapter`).
|
|
167
|
+
* **Cons:** Requires understanding the `Agent` and `Adapter` base classes.
|
|
168
|
+
|
|
169
|
+
## Approach 3: External Control (e.g., CDP)
|
|
170
|
+
|
|
171
|
+
This approach uses HUD primarily for environment provisioning and lifecycle management, while the core interaction happens via a direct connection using a protocol such as the Chrome DevTools Protocol (CDP).
|
|
172
|
+
|
|
173
|
+
* **Concept:** Use `gym.make()` to start an environment (e.g., `"hud-browser"`). Use `env.get_urls()` to retrieve connection details (like a CDP endpoint URL). Use an external library (`pyppeteer`, `playwright`, `selenium` with CDP) to connect directly to the browser instance and control it using that library's commands.
|
|
174
|
+
* **Implementation:**
|
|
175
|
+
* Create the HUD environment: `env = await gym.make(...)`.
|
|
176
|
+
* Get connection info: `urls = await env.get_urls()`, `cdp_url = urls['url']`.
|
|
177
|
+
* Initialize your external library (e.g., `pyppeteer.connect(browserURL=cdp_url)`).
|
|
178
|
+
* Use the external library's functions for interaction (e.g., `page.click()`, `page.type()`). You would likely still call `env.step()` *without actions* periodically to get an updated `Observation` (e.g., a fresh screenshot) for your agent's decision-making, but you would not pass any actions to `env.step()`.
|
|
179
|
+
* When finished, call `await env.evaluate()` and `await env.close()` on the HUD `env` object.
|
|
180
|
+
|
|
181
|
+
```python
|
|
182
|
+
import asyncio
|
|
183
|
+
import os
|
|
184
|
+
from hud import gym, job
|
|
185
|
+
from hud.task import Task
|
|
186
|
+
from hud.utils import stream # For live view
|
|
187
|
+
# Need external library, e.g., pyppeteer (pip install pyppeteer)
|
|
188
|
+
# import pyppeteer
|
|
189
|
+
|
|
190
|
+
@job("external-control-run")
|
|
191
|
+
async def main():
|
|
192
|
+
task = Task(prompt="Externally controlled task", gym="hud-browser", setup=("goto", "google.com"))
|
|
193
|
+
env = await gym.make(task)
|
|
194
|
+
|
|
195
|
+
try:
|
|
196
|
+
urls = await env.get_urls()
|
|
197
|
+
cdp_url = urls.get('url')
|
|
198
|
+
live_url = urls.get('live_url')
|
|
199
|
+
|
|
200
|
+
if not cdp_url:
|
|
201
|
+
raise ConnectionError("Could not get CDP URL from environment.")
|
|
202
|
+
|
|
203
|
+
if live_url:
|
|
204
|
+
stream(live_url) # Show live view
|
|
205
|
+
|
|
206
|
+
print(f"Connecting via CDP: {cdp_url}")
|
|
207
|
+
# --- Connect using external library (Example: pyppeteer) ---
|
|
208
|
+
# browser = await pyppeteer.connect(browserURL=cdp_url)
|
|
209
|
+
# page = (await browser.pages())[0] # Assume first page
|
|
210
|
+
|
|
211
|
+
print("Performing actions via external library (e.g., pyppeteer)...")
|
|
212
|
+
# await page.waitForSelector('textarea[name="q"]', {'visible': True})
|
|
213
|
+
# await page.type('textarea[name="q"]', 'capybara')
|
|
214
|
+
# await page.keyboard.press('Enter')
|
|
215
|
+
# await asyncio.sleep(2) # Wait for results
|
|
216
|
+
|
|
217
|
+
# --- End external library interaction ---
|
|
218
|
+
# await browser.disconnect()
|
|
219
|
+
|
|
220
|
+
print("Evaluating task via env.evaluate()...")
|
|
221
|
+
result = await env.evaluate(("contains_text", "capybara")) # Example eval
|
|
222
|
+
print(f"Evaluation result: {result}")
|
|
223
|
+
|
|
224
|
+
finally:
|
|
225
|
+
print("Closing environment...")
|
|
226
|
+
await env.close()
|
|
227
|
+
|
|
228
|
+
# if __name__ == "__main__":
|
|
229
|
+
# if not os.getenv("HUD_API_KEY"): print("Set HUD_API_KEY")
|
|
230
|
+
# else: asyncio.run(main())
|
|
231
|
+
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
* **Pros:** Maximum control over the environment using specialized libraries. Useful if existing automation scripts use these tools.
|
|
235
|
+
* **Cons:** **Actions taken via the external library are NOT recorded in the HUD trajectory.** Only observations fetched via `env.step()` and the final `env.evaluate()` result are captured. Bypasses the `CLA` abstraction. Requires managing dependencies for the external control library.
|
|
236
|
+
|
|
237
|
+
Choose the approach that best fits your agent's design and your integration needs with the HUD framework's features like trajectory recording and standardized actions.
|