hud-python 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (105) hide show
  1. {hud_python-0.2.0 → hud_python-0.2.1}/PKG-INFO +19 -26
  2. {hud_python-0.2.0 → hud_python-0.2.1}/README.md +16 -23
  3. {hud_python-0.2.0 → hud_python-0.2.1}/docs/concepts/environment.mdx +3 -2
  4. {hud_python-0.2.0 → hud_python-0.2.1}/docs/concepts/task.mdx +32 -2
  5. {hud_python-0.2.0 → hud_python-0.2.1}/docs/docs.json +5 -4
  6. {hud_python-0.2.0 → hud_python-0.2.1}/docs/quickstart.mdx +3 -9
  7. hud_python-0.2.1/docs/running-your-agent.mdx +237 -0
  8. hud_python-0.2.1/examples/browser_use.ipynb +119 -0
  9. hud_python-0.2.1/examples/inspect.ipynb +169 -0
  10. {hud_python-0.2.0 → hud_python-0.2.1}/examples/jobs.ipynb +4 -6
  11. {hud_python-0.2.0 → hud_python-0.2.1}/examples/local.ipynb +7 -3
  12. {hud_python-0.2.0 → hud_python-0.2.1}/examples/osworld.ipynb +3 -3
  13. hud_python-0.2.1/examples/tasks.ipynb +257 -0
  14. {hud_python-0.2.0 → hud_python-0.2.1}/hud/__init__.py +1 -1
  15. {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/claude/adapter.py +9 -1
  16. {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/common/types.py +7 -0
  17. {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/operator/adapter.py +4 -0
  18. {hud_python-0.2.0 → hud_python-0.2.1}/hud/agent/claude.py +22 -2
  19. {hud_python-0.2.0 → hud_python-0.2.1}/hud/agent/operator.py +35 -17
  20. {hud_python-0.2.0 → hud_python-0.2.1}/hud/env/docker_client.py +1 -1
  21. hud_python-0.2.1/hud/env/environment.py +354 -0
  22. {hud_python-0.2.0 → hud_python-0.2.1}/hud/env/local_docker_client.py +3 -1
  23. {hud_python-0.2.0 → hud_python-0.2.1}/hud/task.py +41 -30
  24. {hud_python-0.2.0 → hud_python-0.2.1}/hud/taskset.py +8 -0
  25. {hud_python-0.2.0 → hud_python-0.2.1}/hud/utils/common.py +28 -1
  26. hud_python-0.2.1/hud/utils/config.py +94 -0
  27. {hud_python-0.2.0 → hud_python-0.2.1}/pyproject.toml +3 -3
  28. {hud_python-0.2.0 → hud_python-0.2.1}/tests/test_import.py +1 -1
  29. hud_python-0.2.0/examples/browser_use.ipynb +0 -324
  30. hud_python-0.2.0/examples/tasks.ipynb +0 -117
  31. hud_python-0.2.0/hud/env/environment.py +0 -181
  32. hud_python-0.2.0/hud/utils/config.py +0 -185
  33. {hud_python-0.2.0 → hud_python-0.2.1}/.env.example +0 -0
  34. {hud_python-0.2.0 → hud_python-0.2.1}/.github/workflows/ci.yml +0 -0
  35. {hud_python-0.2.0 → hud_python-0.2.1}/.github/workflows/release.yml +0 -0
  36. {hud_python-0.2.0 → hud_python-0.2.1}/.gitignore +0 -0
  37. {hud_python-0.2.0 → hud_python-0.2.1}/LICENSE +0 -0
  38. {hud_python-0.2.0 → hud_python-0.2.1}/MANIFEST.in +0 -0
  39. {hud_python-0.2.0 → hud_python-0.2.1}/docs/advanced/cla-details.mdx +0 -0
  40. {hud_python-0.2.0 → hud_python-0.2.1}/docs/advanced/custom-environments.mdx +0 -0
  41. {hud_python-0.2.0 → hud_python-0.2.1}/docs/advanced/environment-control.mdx +0 -0
  42. {hud_python-0.2.0 → hud_python-0.2.1}/docs/api/reference/adapters.mdx +0 -0
  43. {hud_python-0.2.0 → hud_python-0.2.1}/docs/api-reference/adapters.mdx +0 -0
  44. {hud_python-0.2.0 → hud_python-0.2.1}/docs/api-reference/env.mdx +0 -0
  45. {hud_python-0.2.0 → hud_python-0.2.1}/docs/api-reference/gym.mdx +0 -0
  46. {hud_python-0.2.0 → hud_python-0.2.1}/docs/api-reference/job.mdx +0 -0
  47. {hud_python-0.2.0 → hud_python-0.2.1}/docs/api-reference/task.mdx +0 -0
  48. {hud_python-0.2.0 → hud_python-0.2.1}/docs/api-reference/taskset.mdx +0 -0
  49. {hud_python-0.2.0 → hud_python-0.2.1}/docs/api-reference/trajectory.mdx +0 -0
  50. {hud_python-0.2.0 → hud_python-0.2.1}/docs/concepts/adapter.mdx +0 -0
  51. {hud_python-0.2.0 → hud_python-0.2.1}/docs/concepts/agent.mdx +0 -0
  52. {hud_python-0.2.0 → hud_python-0.2.1}/docs/concepts/job.mdx +0 -0
  53. {hud_python-0.2.0 → hud_python-0.2.1}/docs/concepts/trajectory.mdx +0 -0
  54. {hud_python-0.2.0 → hud_python-0.2.1}/docs/examples/basic.mdx +0 -0
  55. {hud_python-0.2.0 → hud_python-0.2.1}/docs/examples/claude-agent.mdx +0 -0
  56. {hud_python-0.2.0 → hud_python-0.2.1}/docs/examples/custom-agent.mdx +0 -0
  57. {hud_python-0.2.0 → hud_python-0.2.1}/docs/favicon.png +0 -0
  58. {hud_python-0.2.0 → hud_python-0.2.1}/docs/installation.mdx +0 -0
  59. {hud_python-0.2.0 → hud_python-0.2.1}/docs/logo/HUD-light-optimized.svg +0 -0
  60. {hud_python-0.2.0 → hud_python-0.2.1}/docs/logo/HUD.svg +0 -0
  61. {hud_python-0.2.0 → hud_python-0.2.1}/environments/novnc_ubuntu/Dockerfile +0 -0
  62. {hud_python-0.2.0 → hud_python-0.2.1}/environments/novnc_ubuntu/pyproject.toml +0 -0
  63. {hud_python-0.2.0 → hud_python-0.2.1}/environments/novnc_ubuntu/src/novnc_ubuntu/__init__.py +0 -0
  64. {hud_python-0.2.0 → hud_python-0.2.1}/environments/novnc_ubuntu/src/novnc_ubuntu/pyautogui_rosetta.py +0 -0
  65. {hud_python-0.2.0 → hud_python-0.2.1}/environments/novnc_ubuntu/src/novnc_ubuntu/step.py +0 -0
  66. {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/Dockerfile +0 -0
  67. {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/pyproject.toml +0 -0
  68. {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/__init__.py +0 -0
  69. {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/evaluate/__init__.py +0 -0
  70. {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/evaluate/matchers.py +0 -0
  71. {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/info.py +0 -0
  72. {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/setup/__init__.py +0 -0
  73. {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/setup/question.py +0 -0
  74. {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/step.py +0 -0
  75. {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/utils/__init__.py +0 -0
  76. {hud_python-0.2.0 → hud_python-0.2.1}/environments/qa_controller/src/qa_controller/utils/state.py +0 -0
  77. {hud_python-0.2.0 → hud_python-0.2.1}/examples/README.md +0 -0
  78. {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/__init__.py +0 -0
  79. {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/claude/__init__.py +0 -0
  80. {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/common/__init__.py +0 -0
  81. {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/common/adapter.py +0 -0
  82. {hud_python-0.2.0 → hud_python-0.2.1}/hud/adapters/operator/__init__.py +0 -0
  83. {hud_python-0.2.0 → hud_python-0.2.1}/hud/agent/__init__.py +0 -0
  84. {hud_python-0.2.0 → hud_python-0.2.1}/hud/agent/base.py +0 -0
  85. {hud_python-0.2.0 → hud_python-0.2.1}/hud/env/__init__.py +0 -0
  86. {hud_python-0.2.0 → hud_python-0.2.1}/hud/env/client.py +0 -0
  87. {hud_python-0.2.0 → hud_python-0.2.1}/hud/env/remote_client.py +0 -0
  88. {hud_python-0.2.0 → hud_python-0.2.1}/hud/env/remote_docker_client.py +0 -0
  89. {hud_python-0.2.0 → hud_python-0.2.1}/hud/evaluators/__init__.py +0 -0
  90. {hud_python-0.2.0 → hud_python-0.2.1}/hud/evaluators/base.py +0 -0
  91. {hud_python-0.2.0 → hud_python-0.2.1}/hud/evaluators/inspect.py +0 -0
  92. {hud_python-0.2.0 → hud_python-0.2.1}/hud/evaluators/judge.py +0 -0
  93. {hud_python-0.2.0 → hud_python-0.2.1}/hud/evaluators/match.py +0 -0
  94. {hud_python-0.2.0 → hud_python-0.2.1}/hud/evaluators/remote.py +0 -0
  95. {hud_python-0.2.0 → hud_python-0.2.1}/hud/gym.py +0 -0
  96. {hud_python-0.2.0 → hud_python-0.2.1}/hud/job.py +0 -0
  97. {hud_python-0.2.0 → hud_python-0.2.1}/hud/py.typed +0 -0
  98. {hud_python-0.2.0 → hud_python-0.2.1}/hud/server/__init__.py +0 -0
  99. {hud_python-0.2.0 → hud_python-0.2.1}/hud/server/requests.py +0 -0
  100. {hud_python-0.2.0 → hud_python-0.2.1}/hud/settings.py +0 -0
  101. {hud_python-0.2.0 → hud_python-0.2.1}/hud/trajectory.py +0 -0
  102. {hud_python-0.2.0 → hud_python-0.2.1}/hud/types.py +0 -0
  103. {hud_python-0.2.0 → hud_python-0.2.1}/hud/utils/__init__.py +0 -0
  104. {hud_python-0.2.0 → hud_python-0.2.1}/hud/utils/telemetry.py +0 -0
  105. {hud_python-0.2.0 → hud_python-0.2.1}/tests/__init__.py +0 -0
@@ -1,9 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: SDK for the HUD evaluation platform.
5
- Project-URL: Homepage, https://github.com/Human-Data/hud-sdk
6
- Project-URL: Bug Tracker, https://github.com/Human-Data/hud-sdk/issues
5
+ Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
6
+ Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
7
7
  Project-URL: Documentation, https://hud.so
8
8
  Author-email: Human Union Data SDK <founders@hud.so>
9
9
  License: MIT License
@@ -57,7 +57,7 @@ Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
57
57
  Requires-Dist: ruff==0.9.8; extra == 'dev'
58
58
  Description-Content-Type: text/markdown
59
59
 
60
- # HUD SDK - Human-Agent Interaction Toolkit
60
+ # HUD
61
61
 
62
62
  A Python SDK for creating, evaluating, and benchmarking agent interactions with web browsers and OS environments.
63
63
 
@@ -86,21 +86,20 @@ export HUD_API_KEY=your_api_key_here
86
86
  pip install hud-python
87
87
  ```
88
88
 
89
- ### Simple Browser Example with Operator
89
+ ### Simple Browser Example with Claude Computer Use
90
90
 
91
91
  > This example uses the `@job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
92
92
 
93
+ Make sure you have defined your `ANTHROPIC_API_KEY` in environment variables to run Claude.
94
+
93
95
  ```python
94
- import os
95
96
  import asyncio
96
97
  from hud import gym, job
97
98
  from hud.task import Task
98
- from hud.utils import stream
99
- from hud.agent import OperatorAgent
99
+ from hud.agent import ClaudeAgent
100
100
 
101
101
  @job("test-run")
102
102
  async def main():
103
- # Define a simple task
104
103
  task = Task(
105
104
  prompt="Insert the text 'capybara' into the search bar",
106
105
  gym="hud-browser",
@@ -108,26 +107,20 @@ async def main():
108
107
  evaluate=("contains_text", "capybara")
109
108
  )
110
109
 
111
- # Create environment
110
+ # Create environment using the gym module
112
111
  env = await gym.make(task)
113
112
 
114
- # Get URLs and display live view (optional)
115
- # urls = await env.get_urls()
116
- # stream(urls["live_url"])
117
-
118
113
  # Initialize Operator agent (API key is loaded automatically)
119
- agent = OperatorAgent()
114
+ agent = ClaudeAgent()
120
115
 
121
- # Agent loop
122
- obs, _ = env.reset()
116
+ # Agent loop with predict and step functions
117
+ obs, _ = await env.reset() # Gets first observation
123
118
  for i in range(5):
124
119
  actions, done = await agent.predict(obs)
125
120
  if done:
126
121
  break
127
122
 
128
123
  obs, reward, terminated, info = await env.step(actions)
129
- if terminated:
130
- break
131
124
 
132
125
  # Evaluate and close
133
126
  result = await env.evaluate()
@@ -143,26 +136,26 @@ if __name__ == "__main__":
143
136
 
144
137
  Explore the core concepts and features of the SDK:
145
138
 
146
- * **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios.
139
+ * **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
147
140
  * **[Environments](/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
148
141
  * **[Agents](/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
149
142
  * **[Adapters](/concepts/adapter)**: See how actions and observations are translated between agents and environments.
150
143
  * **[Jobs](/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
151
144
  * **[Trajectories](/concepts/trajectory)**: Understand the recorded data from each agent run.
152
145
  * **Advanced Topics**:
146
+ * **[CLA Action Details](/advanced/cla-details)**: Explore the standardized action format.
153
147
  * **[Custom Environments](/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
154
148
  * **[Advanced Environment Control](/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
155
- * **[CLA Action Details](/advanced/cla-details)**: Dive deeper into the standardized action format.
156
149
 
157
150
  * **[Full API Reference](/api-reference/gym)**: Detailed specifications for all modules and classes.
158
151
 
159
152
  ## [Examples](examples/)
160
153
 
161
- We provide several example notebooks showing how to use the HUD SDK:
154
+ We recommend you first take a look at the example notebooks showing how to use the HUD SDK:
162
155
 
163
156
  1. [Browser Basics](examples/browser_use.ipynb) - Simple browser interaction with live view
164
157
  2. [Task Design](examples/tasks.ipynb) - Creating and customizing tasks
165
- 3. [OSWorld](examples/osworld.ipynb) - Working with OS environments
158
+ 3. [OSWorld](examples/osworld.ipynb) - Running the OSWorld benchmark
166
159
  4. [Local Development](examples/local.ipynb) - Setting up local custom environments
167
160
 
168
161
  ## Documentation
@@ -180,9 +173,9 @@ If you use this SDK in your research, please cite it as follows:
180
173
  ```bibtex
181
174
  @software{hud2025agentevalplatform,
182
175
  author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Max Muoto and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
183
- title = {{HUD: An Evaluation Platform for Computer Use Agents}},
184
- date = {2025-03},
185
- url = {https://github.com/Human-Data/hud-sdk},
176
+ title = {{HUD: An Evaluation Platform for Agents}},
177
+ date = {2025-04},
178
+ url = {https://github.com/hud-evals/hud-sdk},
186
179
  langid = {en}
187
180
  }
188
181
  ```
@@ -1,4 +1,4 @@
1
- # HUD SDK - Human-Agent Interaction Toolkit
1
+ # HUD
2
2
 
3
3
  A Python SDK for creating, evaluating, and benchmarking agent interactions with web browsers and OS environments.
4
4
 
@@ -27,21 +27,20 @@ export HUD_API_KEY=your_api_key_here
27
27
  pip install hud-python
28
28
  ```
29
29
 
30
- ### Simple Browser Example with Operator
30
+ ### Simple Browser Example with Claude Computer Use
31
31
 
32
32
  > This example uses the `@job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
33
33
 
34
+ Make sure you have defined your `ANTHROPIC_API_KEY` in environment variables to run Claude.
35
+
34
36
  ```python
35
- import os
36
37
  import asyncio
37
38
  from hud import gym, job
38
39
  from hud.task import Task
39
- from hud.utils import stream
40
- from hud.agent import OperatorAgent
40
+ from hud.agent import ClaudeAgent
41
41
 
42
42
  @job("test-run")
43
43
  async def main():
44
- # Define a simple task
45
44
  task = Task(
46
45
  prompt="Insert the text 'capybara' into the search bar",
47
46
  gym="hud-browser",
@@ -49,26 +48,20 @@ async def main():
49
48
  evaluate=("contains_text", "capybara")
50
49
  )
51
50
 
52
- # Create environment
51
+ # Create environment using the gym module
53
52
  env = await gym.make(task)
54
53
 
55
- # Get URLs and display live view (optional)
56
- # urls = await env.get_urls()
57
- # stream(urls["live_url"])
58
-
59
54
  # Initialize Operator agent (API key is loaded automatically)
60
- agent = OperatorAgent()
55
+ agent = ClaudeAgent()
61
56
 
62
- # Agent loop
63
- obs, _ = env.reset()
57
+ # Agent loop with predict and step functions
58
+ obs, _ = await env.reset() # Gets first observation
64
59
  for i in range(5):
65
60
  actions, done = await agent.predict(obs)
66
61
  if done:
67
62
  break
68
63
 
69
64
  obs, reward, terminated, info = await env.step(actions)
70
- if terminated:
71
- break
72
65
 
73
66
  # Evaluate and close
74
67
  result = await env.evaluate()
@@ -84,26 +77,26 @@ if __name__ == "__main__":
84
77
 
85
78
  Explore the core concepts and features of the SDK:
86
79
 
87
- * **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios.
80
+ * **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
88
81
  * **[Environments](/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
89
82
  * **[Agents](/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
90
83
  * **[Adapters](/concepts/adapter)**: See how actions and observations are translated between agents and environments.
91
84
  * **[Jobs](/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
92
85
  * **[Trajectories](/concepts/trajectory)**: Understand the recorded data from each agent run.
93
86
  * **Advanced Topics**:
87
+ * **[CLA Action Details](/advanced/cla-details)**: Explore the standardized action format.
94
88
  * **[Custom Environments](/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
95
89
  * **[Advanced Environment Control](/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
96
- * **[CLA Action Details](/advanced/cla-details)**: Dive deeper into the standardized action format.
97
90
 
98
91
  * **[Full API Reference](/api-reference/gym)**: Detailed specifications for all modules and classes.
99
92
 
100
93
  ## [Examples](examples/)
101
94
 
102
- We provide several example notebooks showing how to use the HUD SDK:
95
+ We recommend you first take a look at the example notebooks showing how to use the HUD SDK:
103
96
 
104
97
  1. [Browser Basics](examples/browser_use.ipynb) - Simple browser interaction with live view
105
98
  2. [Task Design](examples/tasks.ipynb) - Creating and customizing tasks
106
- 3. [OSWorld](examples/osworld.ipynb) - Working with OS environments
99
+ 3. [OSWorld](examples/osworld.ipynb) - Running the OSWorld benchmark
107
100
  4. [Local Development](examples/local.ipynb) - Setting up local custom environments
108
101
 
109
102
  ## Documentation
@@ -121,9 +114,9 @@ If you use this SDK in your research, please cite it as follows:
121
114
  ```bibtex
122
115
  @software{hud2025agentevalplatform,
123
116
  author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Max Muoto and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
124
- title = {{HUD: An Evaluation Platform for Computer Use Agents}},
125
- date = {2025-03},
126
- url = {https://github.com/Human-Data/hud-sdk},
117
+ title = {{HUD: An Evaluation Platform for Agents}},
118
+ date = {2025-04},
119
+ url = {https://github.com/hud-evals/hud-sdk},
127
120
  langid = {en}
128
121
  }
129
122
  ```
@@ -67,13 +67,14 @@ obs, _ = await env.reset()
67
67
  for _ in range(10):
68
68
  # 2. Agent predicts action(s)
69
69
  actions, done = await agent.predict(obs)
70
- if done: break
71
70
 
72
71
  # 3. Execute action(s) in environment
73
72
  obs, reward, terminated, info = await env.step(actions)
74
- if terminated: break
73
+ if done or terminated: break
75
74
  ```
76
75
 
76
+ * **Note on QA Tasks:** For [Question-Answering Tasks](/concepts/task#defining-question-answering-qa-tasks), the agent might only need one `predict` call. The agent should output a `ResponseAction`, which the environment stores. The subsequent `env.evaluate()` call then checks this stored response. The environment itself remains largely passive for QA.
77
+
77
78
  ## Key Methods
78
79
 
79
80
  * **`env.step(actions: list[CLA] | None = None)`**: Executes actions (or gets initial state). Returns `(Observation, reward, terminated, info)`.
@@ -60,7 +60,10 @@ Both `setup` and `evaluate` accept configurations defining function calls within
60
60
  * **Purpose:** Determines task success after the agent finishes.
61
61
  * **Execution:** Triggered by `await env.evaluate()`.
62
62
  * **Result:** The return value of `env.evaluate()`, often a reward score (e.g., `1.0` or `0.0`). This is stored in the `reward` field of the [Trajectory](/concepts/trajectory) if linked to a [Job](/concepts/job).
63
- * **Examples:** `("contains_text", "Success!")`, `("file_exists", "/path/to/output.txt")`. Check specific environment controller docs for available functions.
63
+ * **Examples:**
64
+ * Interactive: `("contains_text", "Success!")`, `("file_exists", "/path/to/output.txt")`. These typically call functions *within* the active environment controller.
65
+ * QA: `("response_includes", "Paris")`. These functions often check the text stored in `env.final_response` (which comes from the agent's `ResponseAction`).
66
+ * **Note:** Check specific environment or evaluation service documentation for available functions.
64
67
 
65
68
  ## TaskSet
66
69
 
@@ -99,4 +102,31 @@ my_taskset = TaskSet(tasks=[task1, task2], description="My set")
99
102
  * [Environment](/concepts/environment): Where Tasks are executed and evaluated.
100
103
  * [Agent](/concepts/agent): Aims to complete the Task `prompt`.
101
104
  * [Job](/concepts/job): Groups runs of different Tasks.
102
- * [Trajectory](/concepts/trajectory): Records the execution of a Task.
105
+ * [Trajectory](/concepts/trajectory): Records the execution of a Task.
106
+
107
+ ### Defining Question-Answering (QA) Tasks
108
+
109
+ While HUD excels at interactive tasks, you can also define tasks that are primarily question-answering. The key differences are:
110
+
111
+ * **`gym`:** You might still use an existing environment type like `"hud-browser"` if you want the QA to happen *within* that context (e.g., asking the agent to answer based on a webpage). For pure QA without environment interaction, a future specific `"qa"` gym type might be introduced, but currently, you'd use an existing type.
112
+ * **`prompt`:** Contains the question for the agent.
113
+ * **`setup`:** Often minimal or unnecessary for pure QA.
114
+ * **`evaluate`:** Defines how to check the agent's final text answer. This typically involves calling a specific evaluation function that compares the agent's final submitted response (see `ResponseAction` in [CLA Details](/advanced/cla-details)) against expected criteria. The `env.final_response` attribute holds the text submitted by the agent via `ResponseAction`.
115
+ * **`target`:** (Recommended) Store the ground truth answer in the `metadata` or potentially a dedicated `target` field for clarity during evaluation function design.
116
+
117
+ ```python
118
+ from hud.task import Task
119
+
120
+ qa_task = Task(
121
+ prompt="What is the powerhouse of the cell?",
122
+ gym="hud-browser", # Or potentially a future "qa" type
123
+ # No complex setup needed for pure QA
124
+ setup=(),
125
+ # Evaluation checks the agent's final submitted text response
126
+ evaluate=("response_includes", "mitochondria"), # Assumes a function checking env.final_response
127
+ )
128
+ ```
129
+
130
+ The [Agent](/concepts/agent) handling such a task should recognize it doesn't need complex interaction and output a `ResponseAction` containing the final answer. The `env.evaluate()` call then triggers the specified check (like `response_includes`) against the stored response.
131
+
132
+ ### <a name="configuration-styles"></a>Configuration Styles (`setup` and `evaluate`)
@@ -14,6 +14,7 @@
14
14
  "group": "Getting Started",
15
15
  "pages": [
16
16
  "quickstart",
17
+ "running-your-agent",
17
18
  "installation"
18
19
  ]
19
20
  },
@@ -31,9 +32,9 @@
31
32
  {
32
33
  "group": "Advanced Topics",
33
34
  "pages": [
35
+ "advanced/cla-details",
34
36
  "advanced/custom-environments",
35
- "advanced/environment-control",
36
- "advanced/cla-details"
37
+ "advanced/environment-control"
37
38
  ]
38
39
  },
39
40
  {
@@ -59,13 +60,13 @@
59
60
  "links": [
60
61
  {
61
62
  "label": "GitHub",
62
- "href": "https://github.com/Human-Data/hud-sdk"
63
+ "href": "https://github.com/hud-evals/hud-sdk"
63
64
  }
64
65
  ]
65
66
  },
66
67
  "footer": {
67
68
  "socials": {
68
- "github": "https://github.com/Human-Data/hud-sdk",
69
+ "github": "https://github.com/hud-evals/hud-sdk",
69
70
  "website": "https://hud.so"
70
71
  }
71
72
  }
@@ -31,7 +31,7 @@ The SDK automatically loads API keys from environment variables or a `.env` file
31
31
 
32
32
  Example `.env` file:
33
33
  ```
34
- HUD_API_KEY=hud_...
34
+ HUD_API_KEY=sk-hud-...
35
35
  OPENAI_API_KEY=sk-...
36
36
  # ANTHROPIC_API_KEY=sk-ant-...
37
37
  ```
@@ -79,16 +79,10 @@ async def main():
79
79
  actions, done = await agent.predict(obs)
80
80
  print(f"Agent action(s): {actions}")
81
81
 
82
- if done:
83
- print("Agent signaled task completion.")
84
- break
85
-
86
82
  # Execute the action(s) in the environment
87
83
  obs, reward, terminated, info = await env.step(actions)
88
84
 
89
- if terminated:
90
- print("Environment terminated.")
91
- break
85
+ if done or terminated: break # Agent signaled task completion or environment terminated
92
86
 
93
87
  # 5. Evaluate & Close
94
88
  print("Evaluating task...")
@@ -127,4 +121,4 @@ if __name__ == "__main__":
127
121
 
128
122
  * Explore the [Core Concepts](/concepts/environment) to understand the SDK architecture in more detail.
129
123
  * Check out the [Examples folder in the GitHub repo](/examples/) for more detailed, runnable notebooks covering different agents and environments.
130
- * Review the [API Reference](/api-reference/gym) for comprehensive documentation on specific functions and classes.
124
+ * Review the [API Reference](/api-reference/gym) for comprehensive documentation on specific functions and classes.
@@ -0,0 +1,237 @@
1
+ ---
2
+ title: 'Running Your Own Agent'
3
+ description: 'Integrating custom agent logic with HUD environments'
4
+ ---
5
+
6
+ # Running Your Own Agent
7
+
8
+ The HUD SDK is designed to be flexible, allowing you to integrate various types of AI agents. While the SDK provides built-in agents (`ClaudeAgent`, `OperatorAgent`), you can easily run your own custom agent logic. This guide outlines the primary ways to achieve this.
9
+
10
+ The core interaction loop with any HUD [Environment](/concepts/environment) involves:
11
+ 1. Creating the environment: `env = await hud.gym.make(...)`
12
+ 2. Getting an initial observation: `obs, _ = await env.reset()`
13
+ 3. **Agent Decision:** Processing `obs` to decide on the next action(s).
14
+ 4. Executing actions: `obs, _, _, _ = await env.step(actions)`
15
+ 5. Evaluating the outcome: `result = await env.evaluate()`
16
+ 6. Closing the environment: `await env.close()`
17
+
18
+ The key difference lies in how **Step 3 (Agent Decision)** is implemented and how the resulting `actions` are formatted for **Step 4**.
19
+
20
+ ## Approach 1: Direct CLA Interaction
21
+
22
+ This is the most straightforward approach if your agent logic can directly generate actions conforming to HUD's **Common Language Actions (CLA)** format. See [CLA Action Details](/advanced/cla-details) for format specifics.
23
+
24
+ * **Concept:** Your agent code, running outside the HUD `Agent` class structure, processes the `Observation` and directly constructs a list of `CLA` objects.
25
+ * **Implementation:**
26
+ * Focus on your agent's decision-making process based on `obs.screenshot` and `obs.text`.
27
+ * Your agent's output must be `list[CLA]`. You'll need to import specific `CLA` types (like `ClickAction`, `TypeAction`, etc.) from `hud.adapters.common.types`.
28
+ * Pass this list directly to `env.step()`.
29
+
30
+ ```python
31
+ import asyncio
32
+ from hud import gym, job
33
+ from hud.task import Task
34
+ from hud.env import Observation
35
+ # Import specific CLA types you need
36
+ from hud.adapters import CLA
37
+ from hud.adapters.common.types import ClickAction, TypeAction, Point
38
+
39
+ # --- Your Custom Agent Logic ---
40
+ def my_custom_agent_logic(observation: Observation) -> list[CLA]:
41
+ # Process screenshot/text...
42
+ # Decide on next actions...
43
+ # Example: Click at (100, 150) and type "hello"
44
+ actions = [
45
+ ClickAction(point=Point(x=100, y=150)),
46
+ TypeAction(text="hello")
47
+ ]
48
+ # Ensure the return type is list[CLA]
49
+ return actions
50
+
51
+ @job("custom-cla-agent-run")
52
+ async def main():
53
+ task = Task(prompt="Click and type", gym="hud-browser")
54
+ env = await gym.make(task)
55
+ obs, _ = await env.reset() # Initial observation
56
+
57
+ for i in range(5):
58
+ print(f"--- Step {i+1} ---")
59
+ # Get actions directly from your logic
60
+ agent_actions: list[CLA] = my_custom_agent_logic(obs)
61
+ print(f"Agent actions: {agent_actions}")
62
+
63
+ # Step the environment with CLA actions
64
+ obs, _, terminated, info = await env.step(agent_actions)
65
+
66
+ if terminated: break # Check termination
67
+
68
+ result = await env.evaluate()
69
+ print(f"Evaluation: {result}")
70
+ await env.close()
71
+
72
+ # if __name__ == "__main__":
73
+ # asyncio.run(main())
74
+ ```
75
+
76
+ * **Pros:** Simple integration, doesn't require understanding the `Agent`/`Adapter` inheritance structure.
77
+ * **Cons:** Your agent logic needs to be aware of and construct specific `CLA` Pydantic models. No automatic observation preprocessing (like screenshot rescaling) or action postprocessing (like coordinate rescaling) provided by the `Adapter` framework.
78
+
79
+ ## Approach 2: Inheriting `hud.agent.Agent`
80
+
81
+ This approach leverages the SDK's structure for a more integrated solution.
82
+
83
+ * **Concept:** Create a class that inherits from `hud.agent.Agent`. Implement the `fetch_response` method to contain your core agent logic (calling your model, processing results). Optionally, create a custom `hud.adapters.Adapter` if your model uses a non-standard action format or requires specific observation rescaling.
84
+ * **Implementation:**
85
+ * Define `MyAgent(Agent[MyClientType, MyRawActionType])`.
86
+ * Implement `async def fetch_response(self, observation: Observation) -> tuple[list[MyRawActionType], bool]: ...`. This method should return the *raw* actions from your model and a `done` flag.
87
+ * (Optional) Define `MyAdapter(Adapter)` and implement `convert(self, raw_action: MyRawActionType) -> CLA: ...`. You might also override `__init__` to set `self.agent_width`/`height` if different from the default.
88
+ * Instantiate your agent, optionally passing your custom adapter: `agent = MyAgent(client=my_llm_client, adapter=MyAdapter())`. If you provide an adapter, the base `Agent.predict` method will automatically call `adapter.rescale` before `fetch_response` and `adapter.adapt_list` after.
89
+
90
+ ```python
91
+ import asyncio
92
+ from typing import Any # Placeholder for your raw action type
93
+ from hud import gym, job
94
+ from hud.task import Task
95
+ from hud.env import Observation
96
+ from hud.agent import Agent # Import base class
97
+ from hud.adapters import Adapter, CLA # Import base adapter and CLA type
98
+ # Import your specific CLA types if needed for a custom adapter
99
+ from hud.adapters.common.types import ClickAction, TypeAction, Point
100
+
101
+ # --- Your Custom Agent ---
102
+ class MyRawAction(dict): # Example raw action type (e.g., a dictionary)
103
+ pass
104
+
105
+ class MyAgent(Agent[Any, MyRawAction]): # Specify client type and raw action type
106
+ # You might initialize your LLM client here
107
+ def __init__(self, adapter: Adapter | None = None): # Optionally take an adapter
108
+ super().__init__(client=None, adapter=adapter) # Pass adapter to base
109
+
110
+ async def fetch_response(self, observation: Observation) -> tuple[list[MyRawAction], bool]:
111
+ # 1. Process observation (screenshot already rescaled if adapter exists)
112
+ prompt = f"Image received. Task: {observation.text}. What to do?"
113
+ # 2. Call your custom LLM / logic
114
+ # llm_response = await my_llm_call(prompt, observation.screenshot)
115
+ llm_response = {"action_type": "click", "x": 200, "y": 250} # Dummy response
116
+
117
+ # 3. Convert LLM response to your raw action format
118
+ raw_actions: list[MyRawAction] = [MyRawAction(llm_response)] # Example
119
+ done = False # Decide if task is done
120
+ return raw_actions, done
121
+
122
+ # --- (Optional) Your Custom Adapter ---
123
+ class MyAdapter(Adapter):
124
+ def __init__(self):
125
+ super().__init__()
126
+ self.agent_width = 1000 # Example: If your model expects 1000px wide images
127
+ self.agent_height = 800
128
+
129
+ def convert(self, raw_action: MyRawAction) -> CLA:
130
+ # Convert your raw action dict to a CLA Pydantic model
131
+ if raw_action.get("action_type") == "click":
132
+ return ClickAction(point=Point(x=raw_action["x"], y=raw_action["y"]))
133
+ elif raw_action.get("action_type") == "type":
134
+ return TypeAction(text=raw_action.get("text", ""))
135
+ # ... handle other action types ...
136
+ raise ValueError(f"Unknown raw action type: {raw_action}")
137
+
138
+ # --- Usage ---
139
+ @job("custom-agent-framework-run")
140
+ async def main():
141
+ task = Task(prompt="Use custom agent", gym="hud-browser")
142
+ env = await gym.make(task)
143
+
144
+ # Initialize agent, optionally with the adapter
145
+ my_agent = MyAgent(adapter=MyAdapter()) # Adapter handles conversion + rescaling
146
+
147
+ obs, _ = await env.reset()
148
+ for i in range(5):
149
+ print(f"--- Step {i+1} ---")
150
+ # Predict handles preprocess, fetch_response, postprocess
151
+ processed_actions, done = await my_agent.predict(obs)
152
+ print(f"Processed CLA actions: {processed_actions}")
153
+
154
+ if done: break
155
+ obs, _, terminated, info = await env.step(processed_actions)
156
+ if terminated: break
157
+
158
+ result = await env.evaluate()
159
+ print(f"Evaluation: {result}")
160
+ await env.close()
161
+
162
+ # if __name__ == "__main__":
163
+ # asyncio.run(main())
164
+ ```
165
+
166
+ * **Pros:** Leverages SDK structure, benefits from automatic rescaling (if adapter used), cleaner separation of agent logic (`fetch_response`) and action conversion (`Adapter`).
167
+ * **Cons:** Requires understanding the `Agent` and `Adapter` base classes.
168
+
169
+ ## Approach 3: External Control (e.g., CDP)
170
+
171
+ This approach uses HUD primarily for environment provisioning and lifecycle management, while the core interaction happens via a direct connection using protocols like CDP.
172
+
173
+ * **Concept:** Use `gym.make()` to start an environment (e.g., `"hud-browser"`). Use `env.get_urls()` to retrieve connection details (like a CDP endpoint URL). Use an external library (`pyppeteer`, `playwright`, `selenium` with CDP) to connect directly to the browser instance and control it using that library's commands.
174
+ * **Implementation:**
175
+ * Create the HUD environment: `env = await gym.make(...)`.
176
+ * Get connection info: `urls = await env.get_urls()`, `cdp_url = urls['url']`.
177
+ * Initialize your external library (e.g., `pyppeteer.connect(browserURL=cdp_url)`).
178
+ * Use the external library's functions for interaction (e.g., `page.click()`, `page.type()`). You would likely still use `env.step()` *without actions* periodically to get updated `Observation` (screenshots) for your agent's decision-making, but you wouldn't pass actions *back* to `env.step()`.
179
+ * When finished, call `await env.evaluate()` and `await env.close()` on the HUD `env` object.
180
+
181
+ ```python
182
+ import asyncio
183
+ import os
184
+ from hud import gym, job
185
+ from hud.task import Task
186
+ from hud.utils import stream # For live view
187
+ # Need external library, e.g., pyppeteer (pip install pyppeteer)
188
+ # import pyppeteer
189
+
190
+ @job("external-control-run")
191
+ async def main():
192
+ task = Task(prompt="Externally controlled task", gym="hud-browser", setup=("goto", "google.com"))
193
+ env = await gym.make(task)
194
+
195
+ try:
196
+ urls = await env.get_urls()
197
+ cdp_url = urls.get('url')
198
+ live_url = urls.get('live_url')
199
+
200
+ if not cdp_url:
201
+ raise ConnectionError("Could not get CDP URL from environment.")
202
+
203
+ if live_url:
204
+ stream(live_url) # Show live view
205
+
206
+ print(f"Connecting via CDP: {cdp_url}")
207
+ # --- Connect using external library (Example: pyppeteer) ---
208
+ # browser = await pyppeteer.connect(browserURL=cdp_url)
209
+ # page = (await browser.pages())[0] # Assume first page
210
+
211
+ print("Performing actions via external library (e.g., pyppeteer)...")
212
+ # await page.waitForSelector('textarea[name="q"]', {'visible': True})
213
+ # await page.type('textarea[name="q"]', 'capybara')
214
+ # await page.keyboard.press('Enter')
215
+ # await asyncio.sleep(2) # Wait for results
216
+
217
+ # --- End external library interaction ---
218
+ # await browser.disconnect()
219
+
220
+ print("Evaluating task via env.evaluate()...")
221
+ result = await env.evaluate(("contains_text", "capybara")) # Example eval
222
+ print(f"Evaluation result: {result}")
223
+
224
+ finally:
225
+ print("Closing environment...")
226
+ await env.close()
227
+
228
+ # if __name__ == "__main__":
229
+ # if not os.getenv("HUD_API_KEY"): print("Set HUD_API_KEY")
230
+ # else: asyncio.run(main())
231
+
232
+ ```
233
+
234
+ * **Pros:** Maximum control over the environment using specialized libraries. Useful if existing automation scripts use these tools.
235
+ * **Cons:** **Actions taken via the external library are NOT recorded in the HUD trajectory.** Only observations fetched via `env.step()` and the final `env.evaluate()` result are captured. Bypasses the `CLA` abstraction. Requires managing dependencies for the external control library.
236
+
237
+ Choose the approach that best fits your agent's design and your integration needs with the HUD framework's features like trajectory recording and standardized actions.