hud-python 0.2.1__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (110) hide show
  1. {hud_python-0.2.1 → hud_python-0.2.2}/PKG-INFO +36 -16
  2. {hud_python-0.2.1 → hud_python-0.2.2}/README.md +28 -14
  3. {hud_python-0.2.1 → hud_python-0.2.2}/docs/advanced/cla-details.mdx +5 -0
  4. {hud_python-0.2.1 → hud_python-0.2.2}/docs/concepts/environment.mdx +14 -0
  5. {hud_python-0.2.1 → hud_python-0.2.2}/docs/concepts/task.mdx +5 -4
  6. {hud_python-0.2.1 → hud_python-0.2.2}/docs/docs.json +8 -0
  7. hud_python-0.2.2/docs/environments/hud-browser.mdx +67 -0
  8. hud_python-0.2.2/docs/environments/hud-ubuntu.mdx +55 -0
  9. hud_python-0.2.2/docs/environments/qa.mdx +68 -0
  10. {hud_python-0.2.1 → hud_python-0.2.2}/docs/quickstart.mdx +1 -5
  11. hud_python-0.2.2/examples/WebVoyager_data.jsonl +643 -0
  12. {hud_python-0.2.1 → hud_python-0.2.2}/examples/browser_use.ipynb +1 -0
  13. hud_python-0.2.2/examples/ds_upload.ipynb +2313 -0
  14. hud_python-0.2.2/examples/inspect.ipynb +2091 -0
  15. {hud_python-0.2.1 → hud_python-0.2.2}/examples/local.ipynb +4 -12
  16. {hud_python-0.2.1 → hud_python-0.2.2}/examples/tasks.ipynb +11 -9
  17. {hud_python-0.2.1 → hud_python-0.2.2}/hud/__init__.py +3 -2
  18. {hud_python-0.2.1 → hud_python-0.2.2}/hud/adapters/__init__.py +2 -1
  19. {hud_python-0.2.1 → hud_python-0.2.2}/hud/adapters/claude/adapter.py +9 -4
  20. {hud_python-0.2.1 → hud_python-0.2.2}/hud/adapters/common/types.py +0 -3
  21. {hud_python-0.2.1 → hud_python-0.2.2}/hud/adapters/operator/adapter.py +6 -6
  22. {hud_python-0.2.1 → hud_python-0.2.2}/hud/agent/__init__.py +2 -1
  23. hud_python-0.2.2/hud/agent/langchain.py +198 -0
  24. {hud_python-0.2.1 → hud_python-0.2.2}/hud/env/remote_client.py +4 -0
  25. {hud_python-0.2.1 → hud_python-0.2.2}/hud/gym.py +3 -3
  26. hud_python-0.2.2/hud/job.py +593 -0
  27. {hud_python-0.2.1 → hud_python-0.2.2}/hud/task.py +3 -3
  28. {hud_python-0.2.1 → hud_python-0.2.2}/hud/types.py +5 -3
  29. {hud_python-0.2.1 → hud_python-0.2.2}/hud/utils/common.py +4 -1
  30. {hud_python-0.2.1 → hud_python-0.2.2}/hud/utils/config.py +1 -1
  31. hud_python-0.2.2/hud/utils/progress.py +136 -0
  32. {hud_python-0.2.1 → hud_python-0.2.2}/pyproject.toml +8 -2
  33. {hud_python-0.2.1 → hud_python-0.2.2}/tests/test_import.py +1 -1
  34. hud_python-0.2.1/examples/inspect.ipynb +0 -169
  35. hud_python-0.2.1/hud/job.py +0 -185
  36. {hud_python-0.2.1 → hud_python-0.2.2}/.env.example +0 -0
  37. {hud_python-0.2.1 → hud_python-0.2.2}/.github/workflows/ci.yml +0 -0
  38. {hud_python-0.2.1 → hud_python-0.2.2}/.github/workflows/release.yml +0 -0
  39. {hud_python-0.2.1 → hud_python-0.2.2}/.gitignore +0 -0
  40. {hud_python-0.2.1 → hud_python-0.2.2}/LICENSE +0 -0
  41. {hud_python-0.2.1 → hud_python-0.2.2}/MANIFEST.in +0 -0
  42. {hud_python-0.2.1 → hud_python-0.2.2}/docs/advanced/custom-environments.mdx +0 -0
  43. {hud_python-0.2.1 → hud_python-0.2.2}/docs/advanced/environment-control.mdx +0 -0
  44. {hud_python-0.2.1 → hud_python-0.2.2}/docs/api/reference/adapters.mdx +0 -0
  45. {hud_python-0.2.1 → hud_python-0.2.2}/docs/api-reference/adapters.mdx +0 -0
  46. {hud_python-0.2.1 → hud_python-0.2.2}/docs/api-reference/env.mdx +0 -0
  47. {hud_python-0.2.1 → hud_python-0.2.2}/docs/api-reference/gym.mdx +0 -0
  48. {hud_python-0.2.1 → hud_python-0.2.2}/docs/api-reference/job.mdx +0 -0
  49. {hud_python-0.2.1 → hud_python-0.2.2}/docs/api-reference/task.mdx +0 -0
  50. {hud_python-0.2.1 → hud_python-0.2.2}/docs/api-reference/taskset.mdx +0 -0
  51. {hud_python-0.2.1 → hud_python-0.2.2}/docs/api-reference/trajectory.mdx +0 -0
  52. {hud_python-0.2.1 → hud_python-0.2.2}/docs/concepts/adapter.mdx +0 -0
  53. {hud_python-0.2.1 → hud_python-0.2.2}/docs/concepts/agent.mdx +0 -0
  54. {hud_python-0.2.1 → hud_python-0.2.2}/docs/concepts/job.mdx +0 -0
  55. {hud_python-0.2.1 → hud_python-0.2.2}/docs/concepts/trajectory.mdx +0 -0
  56. {hud_python-0.2.1 → hud_python-0.2.2}/docs/examples/basic.mdx +0 -0
  57. {hud_python-0.2.1 → hud_python-0.2.2}/docs/examples/claude-agent.mdx +0 -0
  58. {hud_python-0.2.1 → hud_python-0.2.2}/docs/examples/custom-agent.mdx +0 -0
  59. {hud_python-0.2.1 → hud_python-0.2.2}/docs/favicon.png +0 -0
  60. {hud_python-0.2.1 → hud_python-0.2.2}/docs/installation.mdx +0 -0
  61. {hud_python-0.2.1 → hud_python-0.2.2}/docs/logo/HUD-light-optimized.svg +0 -0
  62. {hud_python-0.2.1 → hud_python-0.2.2}/docs/logo/HUD.svg +0 -0
  63. {hud_python-0.2.1 → hud_python-0.2.2}/docs/running-your-agent.mdx +0 -0
  64. {hud_python-0.2.1 → hud_python-0.2.2}/environments/novnc_ubuntu/Dockerfile +0 -0
  65. {hud_python-0.2.1 → hud_python-0.2.2}/environments/novnc_ubuntu/pyproject.toml +0 -0
  66. {hud_python-0.2.1 → hud_python-0.2.2}/environments/novnc_ubuntu/src/novnc_ubuntu/__init__.py +0 -0
  67. {hud_python-0.2.1 → hud_python-0.2.2}/environments/novnc_ubuntu/src/novnc_ubuntu/pyautogui_rosetta.py +0 -0
  68. {hud_python-0.2.1 → hud_python-0.2.2}/environments/novnc_ubuntu/src/novnc_ubuntu/step.py +0 -0
  69. {hud_python-0.2.1 → hud_python-0.2.2}/environments/qa_controller/Dockerfile +0 -0
  70. {hud_python-0.2.1 → hud_python-0.2.2}/environments/qa_controller/pyproject.toml +0 -0
  71. {hud_python-0.2.1 → hud_python-0.2.2}/environments/qa_controller/src/qa_controller/__init__.py +0 -0
  72. {hud_python-0.2.1 → hud_python-0.2.2}/environments/qa_controller/src/qa_controller/evaluate/__init__.py +0 -0
  73. {hud_python-0.2.1 → hud_python-0.2.2}/environments/qa_controller/src/qa_controller/evaluate/matchers.py +0 -0
  74. {hud_python-0.2.1 → hud_python-0.2.2}/environments/qa_controller/src/qa_controller/info.py +0 -0
  75. {hud_python-0.2.1 → hud_python-0.2.2}/environments/qa_controller/src/qa_controller/setup/__init__.py +0 -0
  76. {hud_python-0.2.1 → hud_python-0.2.2}/environments/qa_controller/src/qa_controller/setup/question.py +0 -0
  77. {hud_python-0.2.1 → hud_python-0.2.2}/environments/qa_controller/src/qa_controller/step.py +0 -0
  78. {hud_python-0.2.1 → hud_python-0.2.2}/environments/qa_controller/src/qa_controller/utils/__init__.py +0 -0
  79. {hud_python-0.2.1 → hud_python-0.2.2}/environments/qa_controller/src/qa_controller/utils/state.py +0 -0
  80. {hud_python-0.2.1 → hud_python-0.2.2}/examples/README.md +0 -0
  81. {hud_python-0.2.1 → hud_python-0.2.2}/examples/jobs.ipynb +0 -0
  82. {hud_python-0.2.1 → hud_python-0.2.2}/examples/osworld.ipynb +0 -0
  83. {hud_python-0.2.1 → hud_python-0.2.2}/hud/adapters/claude/__init__.py +0 -0
  84. {hud_python-0.2.1 → hud_python-0.2.2}/hud/adapters/common/__init__.py +0 -0
  85. {hud_python-0.2.1 → hud_python-0.2.2}/hud/adapters/common/adapter.py +0 -0
  86. {hud_python-0.2.1 → hud_python-0.2.2}/hud/adapters/operator/__init__.py +0 -0
  87. {hud_python-0.2.1 → hud_python-0.2.2}/hud/agent/base.py +0 -0
  88. {hud_python-0.2.1 → hud_python-0.2.2}/hud/agent/claude.py +0 -0
  89. {hud_python-0.2.1 → hud_python-0.2.2}/hud/agent/operator.py +0 -0
  90. {hud_python-0.2.1 → hud_python-0.2.2}/hud/env/__init__.py +0 -0
  91. {hud_python-0.2.1 → hud_python-0.2.2}/hud/env/client.py +0 -0
  92. {hud_python-0.2.1 → hud_python-0.2.2}/hud/env/docker_client.py +0 -0
  93. {hud_python-0.2.1 → hud_python-0.2.2}/hud/env/environment.py +0 -0
  94. {hud_python-0.2.1 → hud_python-0.2.2}/hud/env/local_docker_client.py +0 -0
  95. {hud_python-0.2.1 → hud_python-0.2.2}/hud/env/remote_docker_client.py +0 -0
  96. {hud_python-0.2.1 → hud_python-0.2.2}/hud/evaluators/__init__.py +0 -0
  97. {hud_python-0.2.1 → hud_python-0.2.2}/hud/evaluators/base.py +0 -0
  98. {hud_python-0.2.1 → hud_python-0.2.2}/hud/evaluators/inspect.py +0 -0
  99. {hud_python-0.2.1 → hud_python-0.2.2}/hud/evaluators/judge.py +0 -0
  100. {hud_python-0.2.1 → hud_python-0.2.2}/hud/evaluators/match.py +0 -0
  101. {hud_python-0.2.1 → hud_python-0.2.2}/hud/evaluators/remote.py +0 -0
  102. {hud_python-0.2.1 → hud_python-0.2.2}/hud/py.typed +0 -0
  103. {hud_python-0.2.1 → hud_python-0.2.2}/hud/server/__init__.py +0 -0
  104. {hud_python-0.2.1 → hud_python-0.2.2}/hud/server/requests.py +0 -0
  105. {hud_python-0.2.1 → hud_python-0.2.2}/hud/settings.py +0 -0
  106. {hud_python-0.2.1 → hud_python-0.2.2}/hud/taskset.py +0 -0
  107. {hud_python-0.2.1 → hud_python-0.2.2}/hud/trajectory.py +0 -0
  108. {hud_python-0.2.1 → hud_python-0.2.2}/hud/utils/__init__.py +0 -0
  109. {hud_python-0.2.1 → hud_python-0.2.2}/hud/utils/telemetry.py +0 -0
  110. {hud_python-0.2.1 → hud_python-0.2.2}/tests/__init__.py +0 -0
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: SDK for the HUD evaluation platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
7
7
  Project-URL: Documentation, https://hud.so
8
- Author-email: Human Union Data SDK <founders@hud.so>
8
+ Author-email: HUD SDK <founders@hud.so>
9
9
  License: MIT License
10
10
 
11
11
  Copyright (c) 2025 Human Union Data, Inc
@@ -37,8 +37,14 @@ Classifier: Programming Language :: Python :: 3.12
37
37
  Classifier: Programming Language :: Python :: 3.13
38
38
  Requires-Python: <3.14,>=3.10
39
39
  Requires-Dist: aiodocker>=0.24.0
40
+ Requires-Dist: anthropic
40
41
  Requires-Dist: httpx<1,>=0.23.0
41
42
  Requires-Dist: inspect-ai>=0.3.80
43
+ Requires-Dist: ipykernel
44
+ Requires-Dist: langchain
45
+ Requires-Dist: langchain-openai
46
+ Requires-Dist: numpy
47
+ Requires-Dist: openai
42
48
  Requires-Dist: pillow>=11.1.0
43
49
  Requires-Dist: pydantic-settings<3,>=2
44
50
  Requires-Dist: pydantic<3,>=2
@@ -117,10 +123,9 @@ async def main():
117
123
  obs, _ = await env.reset() # Gets first observation
118
124
  for i in range(5):
119
125
  actions, done = await agent.predict(obs)
120
- if done:
121
- break
122
-
126
+
123
127
  obs, reward, terminated, info = await env.step(actions)
128
+ if done or terminated: break
124
129
 
125
130
  # Evaluate and close
126
131
  result = await env.evaluate()
@@ -132,22 +137,37 @@ if __name__ == "__main__":
132
137
 
133
138
  ```
134
139
 
140
+ Alternatively, run a full evaluation set via the `run_job` function:
141
+
142
+ ```python
143
+ from hud import load_taskset, run_job, ClaudeAgent
144
+
145
+ # load
146
+ taskset = await load_taskset("GAIA")
147
+
148
+ # evaluate
149
+ job = await run_job(ClaudeAgent, taskset, "test-gaia-job")
150
+
151
+ # get results OR view them in app.hud.so
152
+ print(await job.get_analytics())
153
+ ```
154
+
135
155
  ## Documentation Sections
136
156
 
137
157
  Explore the core concepts and features of the SDK:
138
158
 
139
- * **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
140
- * **[Environments](/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
141
- * **[Agents](/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
142
- * **[Adapters](/concepts/adapter)**: See how actions and observations are translated between agents and environments.
143
- * **[Jobs](/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
144
- * **[Trajectories](/concepts/trajectory)**: Understand the recorded data from each agent run.
159
+ * **[Tasks and TaskSets](https://documentation.hud.so/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
160
+ * **[Environments](https://documentation.hud.so/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
161
+ * **[Agents](https://documentation.hud.so/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
162
+ * **[Adapters](https://documentation.hud.so/concepts/adapter)**: See how actions and observations are translated between agents and environments.
163
+ * **[Jobs](https://documentation.hud.so/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
164
+ * **[Trajectories](https://documentation.hud.so/concepts/trajectory)**: Understand the recorded data from each agent run.
145
165
  * **Advanced Topics**:
146
- * **[CLA Action Details](/advanced/cla-details)**: Explore the standardized action format.
147
- * **[Custom Environments](/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
148
- * **[Advanced Environment Control](/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
166
+ * **[CLA Action Details](https://documentation.hud.so/advanced/cla-details)**: Explore the standardized action format.
167
+ * **[Custom Environments](https://documentation.hud.so/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
168
+ * **[Advanced Environment Control](https://documentation.hud.so/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
149
169
 
150
- * **[Full API Reference](/api-reference/gym)**: Detailed specifications for all modules and classes.
170
+ * **[Full API Reference](https://documentation.hud.so/api-reference/gym)**: Detailed specifications for all modules and classes.
151
171
 
152
172
  ## [Examples](examples/)
153
173
 
@@ -160,7 +180,7 @@ We recommend you first take a look at the example notebooks showing how to use t
160
180
 
161
181
  ## Documentation
162
182
 
163
- For comprehensive guides, examples, and API reference, visit [our docs](https://docs.hud.so/introduction)
183
+ For comprehensive guides, examples, and API reference, visit [our docs](https://documentation.hud.so/introduction)
164
184
 
165
185
  ## License
166
186
 
@@ -58,10 +58,9 @@ async def main():
58
58
  obs, _ = await env.reset() # Gets first observation
59
59
  for i in range(5):
60
60
  actions, done = await agent.predict(obs)
61
- if done:
62
- break
63
-
61
+
64
62
  obs, reward, terminated, info = await env.step(actions)
63
+ if done or terminated: break
65
64
 
66
65
  # Evaluate and close
67
66
  result = await env.evaluate()
@@ -73,22 +72,37 @@ if __name__ == "__main__":
73
72
 
74
73
  ```
75
74
 
75
+ Alternatively, run a full evaluation set via the `run_job` function:
76
+
77
+ ```python
78
+ from hud import load_taskset, run_job, ClaudeAgent
79
+
80
+ # load
81
+ taskset = await load_taskset("GAIA")
82
+
83
+ # evaluate
84
+ job = await run_job(ClaudeAgent, taskset, "test-gaia-job")
85
+
86
+ # get results OR view them in app.hud.so
87
+ print(await job.get_analytics())
88
+ ```
89
+
76
90
  ## Documentation Sections
77
91
 
78
92
  Explore the core concepts and features of the SDK:
79
93
 
80
- * **[Tasks and TaskSets](/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
81
- * **[Environments](/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
82
- * **[Agents](/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
83
- * **[Adapters](/concepts/adapter)**: See how actions and observations are translated between agents and environments.
84
- * **[Jobs](/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
85
- * **[Trajectories](/concepts/trajectory)**: Understand the recorded data from each agent run.
94
+ * **[Tasks and TaskSets](https://documentation.hud.so/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
95
+ * **[Environments](https://documentation.hud.so/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
96
+ * **[Agents](https://documentation.hud.so/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
97
+ * **[Adapters](https://documentation.hud.so/concepts/adapter)**: See how actions and observations are translated between agents and environments.
98
+ * **[Jobs](https://documentation.hud.so/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
99
+ * **[Trajectories](https://documentation.hud.so/concepts/trajectory)**: Understand the recorded data from each agent run.
86
100
  * **Advanced Topics**:
87
- * **[CLA Action Details](/advanced/cla-details)**: Explore the standardized action format.
88
- * **[Custom Environments](/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
89
- * **[Advanced Environment Control](/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
101
+ * **[CLA Action Details](https://documentation.hud.so/advanced/cla-details)**: Explore the standardized action format.
102
+ * **[Custom Environments](https://documentation.hud.so/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
103
+ * **[Advanced Environment Control](https://documentation.hud.so/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
90
104
 
91
- * **[Full API Reference](/api-reference/gym)**: Detailed specifications for all modules and classes.
105
+ * **[Full API Reference](https://documentation.hud.so/api-reference/gym)**: Detailed specifications for all modules and classes.
92
106
 
93
107
  ## [Examples](examples/)
94
108
 
@@ -101,7 +115,7 @@ We recommend you first take a look at the example notebooks showing how to use t
101
115
 
102
116
  ## Documentation
103
117
 
104
- For comprehensive guides, examples, and API reference, visit [our docs](https://docs.hud.so/introduction)
118
+ For comprehensive guides, examples, and API reference, visit [our docs](https://documentation.hud.so/introduction)
105
119
 
106
120
  ## License
107
121
 
@@ -69,6 +69,11 @@ Here are some key CLA types grouped by category:
69
69
  * **`ScreenshotFetch`**: Requests a screenshot (used internally, typically not sent by agents directly).
70
70
  * **`PositionFetch`**: Requests the current cursor position (used internally).
71
71
 
72
+ ### Response Actions
73
+
74
+ * **`ResponseAction`**: Used to submit a final text answer.
75
+ * `text: str`: The final textual response from the agent.
76
+
72
77
  ### Custom Actions
73
78
 
74
79
  * **`CustomAction`**: Allows defining arbitrary actions specific to a custom environment controller.
@@ -48,6 +48,20 @@ env_os = await gym.make("OSWorld-Ubuntu")
48
48
 
49
49
  Environments created this way won't have a default `Task` associated unless you explicitly reset them with one later using `env.reset()`. The `gym.make()` function also automatically links the environment to an active [Job](/concepts/job) if one was defined using the `@job` decorator.
50
50
 
51
+ ## Available Environment Types
52
+
53
+ The HUD SDK provides several standard environment types, specified via the `gym` attribute in a [Task](/concepts/task) or directly in `hud.gym.make()`:
54
+
55
+ * **`"hud-browser"`**: Provides a remote Chromium browser instance managed via Playwright. Ideal for web navigation, form interaction, and testing web applications.
56
+ * [See `hud-browser` Details](../environments/hud-browser.mdx)
57
+ * **`"hud-ubuntu"`**: Provides a remote Ubuntu desktop environment accessed via VNC. Suitable for tasks involving GUI applications, file system interaction, or running Linux software.
58
+ * [See `hud-ubuntu` Details](../environments/hud-ubuntu.mdx)
59
+ * **`"qa"`**: A non-interactive environment for question-answering tasks where the agent provides a direct textual response.
60
+ * [See `qa` Environment Details](../environments/qa.mdx)
61
+ * **`CustomGym`**: Allows defining and running your own [Custom Environments](../advanced/custom-environments.mdx) using Docker, either locally or remotely. This provides maximum flexibility for specific testing needs.
62
+
63
+ The `gym` attribute in a Task tells `hud.gym.make()` which environment to instantiate.
64
+
51
65
  ## Interaction Loop
52
66
 
53
67
  The standard interaction flow involves the [Agent](/concepts/agent) and the Environment:
@@ -30,11 +30,10 @@ task = Task(
30
30
  prompt="Log in to example.com with username 'test'",
31
31
  gym="hud-browser", # Request a browser environment
32
32
  setup=[ # Actions run by gym.make(task)
33
- ("goto", "https://example.com/login"),
34
- {"function": "wait_for_element", "args": ["#username"]}
33
+ ("goto", "https://example.com/login")
35
34
  ],
36
35
  evaluate={ # Logic run by env.evaluate()
37
- "function": "check_login_status",
36
+ "function": "page_contains",
38
37
  "args": ["test"]
39
38
  }
40
39
  )
@@ -82,11 +81,13 @@ Load predefined sets from the HUD platform:
82
81
  ```python
83
82
  from hud import load_taskset
84
83
 
85
- taskset = await load_taskset("OSWorld-Ubuntu-Links")
84
+ taskset = await load_taskset("OSWorld-Ubuntu")
86
85
  print(f"Number of tasks: {len(taskset)}") # TaskSet acts like a list
87
86
  first_task = taskset[0]
88
87
  ```
89
88
 
89
+ TaskSets currently available via `load_taskset` include subsets of OSWorld, GAIA, and WebVoyager.
90
+
90
91
  ### Creating a TaskSet Manually
91
92
 
92
93
  ```python
@@ -29,6 +29,14 @@
29
29
  "concepts/trajectory"
30
30
  ]
31
31
  },
32
+ {
33
+ "group": "Environments",
34
+ "pages": [
35
+ "environments/hud-browser",
36
+ "environments/hud-ubuntu",
37
+ "environments/qa"
38
+ ]
39
+ },
32
40
  {
33
41
  "group": "Advanced Topics",
34
42
  "pages": [
@@ -0,0 +1,67 @@
1
+ # HUD Browser Environment
2
+
3
+ ## Introduction
4
+
5
+ The `hud-browser` environment provides a remote Chromium browser instance, managed by Playwright, for agents to interact with websites. It's ideal for tasks involving web navigation, form filling, information retrieval, and testing web applications.
6
+
7
+ ## Setup
8
+
9
+ Setup actions for the `hud-browser` are defined in the `setup` attribute of a [Task](../concepts/task.mdx) and executed by `hud.gym.make()`. They typically involve browser controller functions.
10
+
11
+ * **`goto(url: str)`**: Navigates the browser to the specified `url`. Automatically prepends `http://` if no scheme is provided. Waits for `domcontentloaded` (up to 10s timeout) and adds a 1s wait for rendering.
12
+ ```python
13
+ # Example Task Setup:
14
+ setup=[("goto", "https://google.com")]
15
+ ```
16
+ * **Other common setup functions coming soon:** `wait_for_element`, `click`, `type`, `set_cookies` etc.
17
+
18
+ Refer to [Task Setup Configuration](../concepts/task.mdx#setup-configuration) for how to define these.
19
+
20
+ ## Step Interaction
21
+
22
+ Agents interact with the browser environment by sending a list of [CLA Actions](../advanced/cla-details.mdx) to `env.step()`. An [Adapter](../concepts/adapter.mdx) typically handles the conversion from the agent model's output to the CLA format.
23
+
24
+ Common CLAs used with `hud-browser`:
25
+ * [`ClickAction`](../advanced/cla-details.mdx#mouse-actions)
26
+ * [`MoveAction`](../advanced/cla-details.mdx#mouse-actions)
27
+ * [`TypeAction`](../advanced/cla-details.mdx#keyboard-actions)
28
+ * [`PressAction`](../advanced/cla-details.mdx#keyboard-actions)
29
+ * [`ScrollAction`](../advanced/cla-details.mdx#mouse-actions)
30
+ * [`DragAction`](../advanced/cla-details.mdx#mouse-actions)
31
+ * [`ResponseAction`](../advanced/cla-details.mdx#response-actions) (to submit a final text answer)
32
+
33
+ *See [CLA Action Details](../advanced/cla-details.mdx) for the full specification.*
34
+
35
+ ## Evaluate
36
+
37
+ The `evaluate` attribute of a [Task](../concepts/task.mdx) defines how success is measured using `env.evaluate()`. This calls functions within the browser controller.
38
+
39
+ Built-in evaluation functions for `hud-browser`:
40
+
41
+ * **`url_match(expected_url: str)`**: Checks if the current browser URL exactly matches `expected_url`. Returns `1.0` for a match, `0.0` otherwise.
42
+ ```python
43
+ # Example Task Evaluation:
44
+ evaluate=("url_match", "https://google.com/search?q=expected")
45
+ ```
46
+ * **`page_contains(texts: list[str])`** (alias `contains_text`): Checks if *all* strings in `texts` are present in `page.content()`. Returns `1.0` if all texts are found, `0.0` otherwise.
47
+ ```python
48
+ # Example Task Evaluation:
49
+ evaluate=("page_contains", ["Search Results", "About 1,000,000 results"])
50
+ ```
51
+ * **`sheet_contains(texts: list[str])`**: Custom function for Google Sheets. Returns `1.0` if any text is found, `0.0` otherwise.
52
+ ```python
53
+ # Example Task Evaluation:
54
+ evaluate=("sheet_contains", ["Expected value in cell A1"])
55
+ ```
56
+ * **`cookie_exists(cookie_names: list[str])`**: Checks if all cookies in `cookie_names` exist in `context.cookies()`. Returns `1.0` if all exist, `0.0` otherwise.
57
+ ```python
58
+ # Example Task Evaluation:
59
+ evaluate=("cookie_exists", ["session_id", "user_pref"])
60
+ ```
61
+ * **`cookie_match(name_value_pairs: list[str])`**: Checks if cookies exist *and* match expected values. `name_value_pairs` format: `[name1, value1, name2, value2, ...]`. Returns `1.0` if all match, `0.0` otherwise.
62
+ ```python
63
+ # Example Task Evaluation:
64
+ evaluate=("cookie_match", ["user_id", "12345", "theme", "dark"])
65
+ ```
66
+
67
+ Refer to [Task Evaluation Configuration](../concepts/task.mdx#evaluation-configuration) for more details.
@@ -0,0 +1,55 @@
1
+ # HUD Ubuntu Environment
2
+
3
+ ## Introduction
4
+
5
+ The `hud-ubuntu` environment provides a remote Ubuntu OS instance with a graphical desktop, accessed via a VNC connection displayed in the browser. It's suitable for tasks requiring interaction with GUI applications, the file system, or running specific software within a Linux desktop environment.
6
+
7
+ ## Setup
8
+
9
+ The environment setup simply launches the Ubuntu desktop session within the VNC viewer.
10
+
11
+ *Specific pre-launch setup functions (e.g., pre-installing packages, setting environment variables) are planned for future releases.*
12
+
13
+ Currently, any necessary setup (like installing software or creating files) must be performed by the agent *after* the environment starts, using standard interaction actions.
14
+
15
+ Refer to [Task Setup Configuration](../concepts/task.mdx#setup-configuration) for the general concept of how setup steps *could* be defined in a Task, although they are not currently implemented for this specific environment.
16
+
17
+ ## Step Interaction
18
+
19
+ Agents interact with the Ubuntu desktop environment by sending a list of [CLA Actions](../advanced/cla-details.mdx) to `env.step()`. An [Adapter](../concepts/adapter.mdx) typically handles the conversion from the agent model's output to the CLA format.
20
+
21
+ Available CLA actions for interacting with the graphical desktop:
22
+
23
+ **Keyboard Actions:**
24
+ * [`TypeAction`](../advanced/cla-details.mdx#keyboard-actions): Simulates typing text into the focused application or window element.
25
+ ```python
26
+ # Example: Typing into a text editor
27
+ TypeAction(text="Hello, Ubuntu!")
28
+ ```
29
+ * [`PressAction`](../advanced/cla-details.mdx#keyboard-actions): For sending hotkeys (e.g., `Ctrl+C`, `Alt+F4`).
30
+ * [`KeyDownAction` / `KeyUpAction`](../advanced/cla-details.mdx#keyboard-actions): For holding/releasing modifier keys (e.g., holding Shift while clicking).
31
+
32
+ **Mouse Actions:**
33
+ * [`ClickAction`](../advanced/cla-details.mdx#mouse-actions): To click on GUI elements (buttons, icons, menus, etc.).
34
+ * [`MoveAction`](../advanced/cla-details.mdx#mouse-actions): To move the mouse cursor to specific coordinates or elements.
35
+ * [`ScrollAction`](../advanced/cla-details.mdx#mouse-actions): To scroll within windows or applications.
36
+ * [`DragAction`](../advanced/cla-details.mdx#mouse-actions): To perform drag-and-drop operations.
37
+
38
+ **Control & Response Actions:**
39
+ * [`WaitAction`](../advanced/cla-details.mdx#control-actions): To introduce pauses if needed.
40
+ * [`ResponseAction`](../advanced/cla-details.mdx#response-actions): Used by the agent to submit its final answer or result text.
41
+
42
+ **Other Actions (less common for direct agent use):**
43
+ * [`ScreenshotFetch`](../advanced/cla-details.mdx#fetch-actions-get-information): Usually handled internally by the environment/agent loop.
44
+ * [`PositionFetch`](../advanced/cla-details.mdx#fetch-actions-get-information): Usually handled internally.
45
+ * [`CustomAction`](../advanced/cla-details.mdx#custom-actions): For potential future custom environment extensions.
46
+
47
+ *See [CLA Action Details](../advanced/cla-details.mdx) for the full specification of each action and its parameters.*
48
+
49
+ ## Evaluate
50
+
51
+ *Specific evaluation functions for `hud-ubuntu` (e.g., checking file content, application state, process status) are planned for future releases.*
52
+
53
+ Currently, evaluation often relies on the agent submitting a final answer via `ResponseAction`, which can then be checked using generic QA evaluators defined in the [Task](../concepts/task.mdx) (like `response_includes`, `response_match`). Alternatively, evaluation might involve visually inspecting the final state via the VNC connection or checking logs if the agent was tasked with producing specific output.
54
+
55
+ Refer to [Task Evaluation Configuration](../concepts/task.mdx#evaluation-configuration) for the general concept of defining evaluation steps.
@@ -0,0 +1,68 @@
1
+ # QA Environment
2
+
3
+ ## Introduction
4
+
5
+ The `qa` environment is a specialized, non-interactive environment designed for question-answering tasks. The agent receives context or a question via the `Task.prompt` and is expected to provide a final text response.
6
+
7
+ ## Setup
8
+
9
+ No environment-specific setup actions are typically required for `qa` tasks. The question or context is provided directly in the `Task.prompt`.
10
+
11
+ ```python
12
+ from hud.task import Task
13
+
14
+ qa_task = Task(
15
+ prompt="What is the powerhouse of the cell?",
16
+ gym="qa" # Specify the QA environment
17
+ # evaluate=... (see below)
18
+ )
19
+ ```
20
+
21
+ Refer to [Task Setup Configuration](../concepts/task.mdx#setup-configuration) for the general concept of how setup steps *could* be defined in a Task, although they are generally not needed for `qa`.
22
+
23
+ ## Step Interaction
24
+
25
+ Agents interact with the `qa` environment primarily by submitting their final answer.
26
+
27
+ * The agent receives the `Task.prompt` in the initial `Observation`.
28
+ * The agent processes the prompt and determines its answer.
29
+ * The agent sends a single [`ResponseAction`](../advanced/cla-details.mdx#response-actions) containing the answer text to `env.step()`.
30
+
31
+ ```python
32
+ # Agent predicts and sends this action:
33
+ from hud.adapters.common.types import ResponseAction
34
+
35
+ action = ResponseAction(text="Mitochondria")
36
+ # await env.step([action])
37
+ ```
38
+
39
+ The environment stores the text from the first `ResponseAction` it receives in an internal `env.final_response` attribute for evaluation.
40
+
41
+ **Other CLAs:** While technically part of the [CLA standard](../advanced/cla-details.mdx), other actions (like `ClickAction`, `TypeAction`, `ScrollAction`, etc.) are not processed or relevant in the standard `qa` environment.
42
+
43
+ ## Evaluate
44
+
45
+ Evaluation logic is defined in the `evaluate` attribute of the [Task](../concepts/task.mdx) and triggered by `env.evaluate()`. This logic compares the `env.final_response` (the text submitted by the agent via `ResponseAction`) against expected criteria.
46
+
47
+ Common evaluation methods for `qa` tasks:
48
+ * **`response_includes(substring: str | list[str])`**: Checks if the response text contains the specified `substring` or *all* of the substrings in the provided list.
49
+ ```python
50
+ # Example Task Evaluation (single string):
51
+ evaluate=("response_includes", "mitochondria")
52
+ # Example Task Evaluation (list of strings):
53
+ # evaluate=("response_includes", ["powerhouse", "cell"])
54
+ ```
55
+ * **`response_is(expected_text: str)`**: Checks for an exact, case-sensitive match with the `expected_text`.
56
+ ```python
57
+ # Example Task Evaluation:
58
+ evaluate=("response_is", "Mitochondria")
59
+ ```
60
+ * **`response_match(pattern: str)`**: Checks if the response text matches the provided regular expression `pattern`.
61
+ ```python
62
+ # Example Task Evaluation:
63
+ evaluate=("response_match", r"^[Mm]itochondria.?$") # Accepts "M" or "m" as the first letter; `.?` allows one optional trailing character (use `\.?` to allow only a period)
64
+ ```
65
+
66
+ *Note: The exact names and availability of evaluation functions might evolve. Refer to specific evaluator documentation or examples for the most current details.*
67
+
68
+ Refer to [Task Evaluation Configuration](../concepts/task.mdx#evaluation-configuration) for more details on defining evaluation logic.
@@ -71,7 +71,7 @@ async def main():
71
71
  # 4. Interaction Loop: Agent observes and acts
72
72
  print("Starting interaction loop...")
73
73
  # Get initial observation (screenshot, text, etc.) by resetting the environment
74
- obs, _ = env.reset()
74
+ obs, _ = await env.reset()
75
75
 
76
76
  for i in range(5): # Limit to 5 steps for this example
77
77
  print(f"--- Step {i+1} ---")
@@ -89,10 +89,6 @@ async def main():
89
89
  result = await env.evaluate() # Run the evaluation defined in the Task
90
90
  print(f"Evaluation result: {result}")
91
91
 
92
- # Trajectory is automatically saved if a @job decorator is used
93
- # trajectory = await env.get_trajectory() # You can optionally get trajectory data
94
- # print(f"Trajectory ID: {trajectory.id}")
95
-
96
92
  print("Closing environment...")
97
93
  await env.close() # Clean up environment resources
98
94