hud-python 0.2.5__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (165) hide show
  1. {hud_python-0.2.5 → hud_python-0.2.6}/PKG-INFO +18 -18
  2. {hud_python-0.2.5 → hud_python-0.2.6}/README.md +17 -17
  3. {hud_python-0.2.5 → hud_python-0.2.6}/docs/concepts/environment.mdx +0 -2
  4. {hud_python-0.2.5 → hud_python-0.2.6}/docs/docs.json +0 -1
  5. {hud_python-0.2.5 → hud_python-0.2.6}/docs/environment-creation.mdx +23 -1
  6. {hud_python-0.2.5 → hud_python-0.2.6}/docs/quickstart.mdx +5 -2
  7. hud_python-0.2.6/examples/osworld.ipynb +199 -0
  8. {hud_python-0.2.5 → hud_python-0.2.6}/examples/tasks.ipynb +4 -11
  9. hud_python-0.2.6/examples/wordle_example.ipynb +244 -0
  10. {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/claude_plays_pokemon.py +2 -1
  11. {hud_python-0.2.5 → hud_python-0.2.6}/hud/env/remote_docker_client.py +2 -2
  12. {hud_python-0.2.5 → hud_python-0.2.6}/hud/job.py +9 -9
  13. {hud_python-0.2.5 → hud_python-0.2.6}/hud/server/requests.py +26 -4
  14. {hud_python-0.2.5 → hud_python-0.2.6}/hud/settings.py +1 -1
  15. {hud_python-0.2.5 → hud_python-0.2.6}/hud/taskset.py +16 -4
  16. {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/context.py +33 -57
  17. {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/instrumentation/mcp.py +0 -3
  18. {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/tests/test_context.py +7 -3
  19. {hud_python-0.2.5 → hud_python-0.2.6}/hud/types.py +1 -1
  20. {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/tests/test_version.py +1 -1
  21. {hud_python-0.2.5 → hud_python-0.2.6}/hud/version.py +1 -1
  22. {hud_python-0.2.5 → hud_python-0.2.6}/pyproject.toml +1 -1
  23. hud_python-0.2.5/docs/environments/ubuntu.mdx +0 -118
  24. hud_python-0.2.5/examples/osworld.ipynb +0 -240
  25. {hud_python-0.2.5 → hud_python-0.2.6}/.env.example +0 -0
  26. {hud_python-0.2.5 → hud_python-0.2.6}/.github/workflows/ci.yml +0 -0
  27. {hud_python-0.2.5 → hud_python-0.2.6}/.github/workflows/release.yml +0 -0
  28. {hud_python-0.2.5 → hud_python-0.2.6}/.gitignore +0 -0
  29. {hud_python-0.2.5 → hud_python-0.2.6}/LICENSE +0 -0
  30. {hud_python-0.2.5 → hud_python-0.2.6}/MANIFEST.in +0 -0
  31. {hud_python-0.2.5 → hud_python-0.2.6}/docs/advanced/cla-details.mdx +0 -0
  32. {hud_python-0.2.5 → hud_python-0.2.6}/docs/advanced/environment-control.mdx +0 -0
  33. {hud_python-0.2.5 → hud_python-0.2.6}/docs/advanced/tracing.mdx +0 -0
  34. {hud_python-0.2.5 → hud_python-0.2.6}/docs/advanced/uploading.mdx +0 -0
  35. {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/adapters.mdx +0 -0
  36. {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/env.mdx +0 -0
  37. {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/gym.mdx +0 -0
  38. {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/job.mdx +0 -0
  39. {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/task.mdx +0 -0
  40. {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/taskset.mdx +0 -0
  41. {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/telemetry.mdx +0 -0
  42. {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/trajectory.mdx +0 -0
  43. {hud_python-0.2.5 → hud_python-0.2.6}/docs/concepts/adapter.mdx +0 -0
  44. {hud_python-0.2.5 → hud_python-0.2.6}/docs/concepts/agent.mdx +0 -0
  45. {hud_python-0.2.5 → hud_python-0.2.6}/docs/concepts/job.mdx +0 -0
  46. {hud_python-0.2.5 → hud_python-0.2.6}/docs/concepts/task.mdx +0 -0
  47. {hud_python-0.2.5 → hud_python-0.2.6}/docs/concepts/trajectory.mdx +0 -0
  48. {hud_python-0.2.5 → hud_python-0.2.6}/docs/environments/browser.mdx +0 -0
  49. {hud_python-0.2.5 → hud_python-0.2.6}/docs/environments/custom-environments.mdx +0 -0
  50. {hud_python-0.2.5 → hud_python-0.2.6}/docs/environments/custom.mdx +0 -0
  51. {hud_python-0.2.5 → hud_python-0.2.6}/docs/environments/osworld-ubuntu.mdx +0 -0
  52. {hud_python-0.2.5 → hud_python-0.2.6}/docs/environments/qa.mdx +0 -0
  53. {hud_python-0.2.5 → hud_python-0.2.6}/docs/examples/alignment-evaluation.mdx +0 -0
  54. {hud_python-0.2.5 → hud_python-0.2.6}/docs/examples/benchmarking-agents.mdx +0 -0
  55. {hud_python-0.2.5 → hud_python-0.2.6}/docs/examples/custom-os-env.mdx +0 -0
  56. {hud_python-0.2.5 → hud_python-0.2.6}/docs/examples/mcp-agent-tracing.mdx +0 -0
  57. {hud_python-0.2.5 → hud_python-0.2.6}/docs/examples/web-app-testing.mdx +0 -0
  58. {hud_python-0.2.5 → hud_python-0.2.6}/docs/favicon.png +0 -0
  59. {hud_python-0.2.5 → hud_python-0.2.6}/docs/logo/hud_logo.svg +0 -0
  60. {hud_python-0.2.5 → hud_python-0.2.6}/docs/logo/hud_logo_dark.svg +0 -0
  61. {hud_python-0.2.5 → hud_python-0.2.6}/docs/running-your-agent.mdx +0 -0
  62. {hud_python-0.2.5 → hud_python-0.2.6}/docs/task-creation.mdx +0 -0
  63. {hud_python-0.2.5 → hud_python-0.2.6}/environments/novnc_ubuntu/Dockerfile +0 -0
  64. {hud_python-0.2.5 → hud_python-0.2.6}/environments/novnc_ubuntu/pyproject.toml +0 -0
  65. {hud_python-0.2.5 → hud_python-0.2.6}/environments/novnc_ubuntu/src/hud_controller/__init__.py +0 -0
  66. {hud_python-0.2.5 → hud_python-0.2.6}/environments/novnc_ubuntu/src/hud_controller/pyautogui_rosetta.py +0 -0
  67. {hud_python-0.2.5 → hud_python-0.2.6}/environments/novnc_ubuntu/src/hud_controller/step.py +0 -0
  68. {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/Dockerfile +0 -0
  69. {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/pyproject.toml +0 -0
  70. {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/__init__.py +0 -0
  71. {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/display_adapters.py +0 -0
  72. {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/emulator.py +0 -0
  73. {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/evaluator.py +0 -0
  74. {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/kill.py +0 -0
  75. {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/main.py +0 -0
  76. {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/setup.py +0 -0
  77. {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/step.py +0 -0
  78. {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/Dockerfile +0 -0
  79. {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/pyproject.toml +0 -0
  80. {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/__init__.py +0 -0
  81. {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/evaluate/__init__.py +0 -0
  82. {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/evaluate/matchers.py +0 -0
  83. {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/info.py +0 -0
  84. {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/setup/__init__.py +0 -0
  85. {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/setup/question.py +0 -0
  86. {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/step.py +0 -0
  87. {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/utils/__init__.py +0 -0
  88. {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/utils/state.py +0 -0
  89. {hud_python-0.2.5 → hud_python-0.2.6}/examples/README.md +0 -0
  90. {hud_python-0.2.5 → hud_python-0.2.6}/examples/browser_use.ipynb +0 -0
  91. /hud_python-0.2.5/examples/example.ipynb → /hud_python-0.2.6/examples/custom_task_example.ipynb +0 -0
  92. {hud_python-0.2.5 → hud_python-0.2.6}/examples/jobs.ipynb +0 -0
  93. {hud_python-0.2.5 → hud_python-0.2.6}/examples/local.ipynb +0 -0
  94. {hud_python-0.2.5 → hud_python-0.2.6}/examples/mcp_test.ipynb +0 -0
  95. {hud_python-0.2.5 → hud_python-0.2.6}/examples/pokemon_local.ipynb +0 -0
  96. {hud_python-0.2.5 → hud_python-0.2.6}/examples/pokemon_remote.ipynb +0 -0
  97. {hud_python-0.2.5 → hud_python-0.2.6}/examples/remote.ipynb +0 -0
  98. {hud_python-0.2.5 → hud_python-0.2.6}/hud/__init__.py +0 -0
  99. {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/__init__.py +0 -0
  100. {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/claude/__init__.py +0 -0
  101. {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/claude/adapter.py +0 -0
  102. {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/claude/tests/__init__.py +0 -0
  103. {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/claude/tests/test_adapter.py +0 -0
  104. {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/common/__init__.py +0 -0
  105. {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/common/adapter.py +0 -0
  106. {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/common/tests/__init__.py +0 -0
  107. {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/common/tests/test_adapter.py +0 -0
  108. {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/common/types.py +0 -0
  109. {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/operator/__init__.py +0 -0
  110. {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/operator/adapter.py +0 -0
  111. {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/operator/tests/__init__.py +0 -0
  112. {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/operator/tests/test_adapter.py +0 -0
  113. {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/__init__.py +0 -0
  114. {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/base.py +0 -0
  115. {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/claude.py +0 -0
  116. {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/langchain.py +0 -0
  117. {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/misc/__init__.py +0 -0
  118. {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/misc/response_agent.py +0 -0
  119. {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/operator.py +0 -0
  120. {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/tests/__init__.py +0 -0
  121. {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/tests/test_base.py +0 -0
  122. {hud_python-0.2.5 → hud_python-0.2.6}/hud/env/__init__.py +0 -0
  123. {hud_python-0.2.5 → hud_python-0.2.6}/hud/env/client.py +0 -0
  124. {hud_python-0.2.5 → hud_python-0.2.6}/hud/env/docker_client.py +0 -0
  125. {hud_python-0.2.5 → hud_python-0.2.6}/hud/env/environment.py +0 -0
  126. {hud_python-0.2.5 → hud_python-0.2.6}/hud/env/local_docker_client.py +0 -0
  127. {hud_python-0.2.5 → hud_python-0.2.6}/hud/env/remote_client.py +0 -0
  128. {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/__init__.py +0 -0
  129. {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/base.py +0 -0
  130. {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/inspect.py +0 -0
  131. {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/judge.py +0 -0
  132. {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/match.py +0 -0
  133. {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/remote.py +0 -0
  134. {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/tests/__init__.py +0 -0
  135. {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/tests/test_inspect.py +0 -0
  136. {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/tests/test_judge.py +0 -0
  137. {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/tests/test_match.py +0 -0
  138. {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/tests/test_remote.py +0 -0
  139. {hud_python-0.2.5 → hud_python-0.2.6}/hud/exceptions.py +0 -0
  140. {hud_python-0.2.5 → hud_python-0.2.6}/hud/gym.py +0 -0
  141. {hud_python-0.2.5 → hud_python-0.2.6}/hud/py.typed +0 -0
  142. {hud_python-0.2.5 → hud_python-0.2.6}/hud/server/__init__.py +0 -0
  143. {hud_python-0.2.5 → hud_python-0.2.6}/hud/server/tests/__init__.py +0 -0
  144. {hud_python-0.2.5 → hud_python-0.2.6}/hud/server/tests/test_requests.py +0 -0
  145. {hud_python-0.2.5 → hud_python-0.2.6}/hud/task.py +0 -0
  146. {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/__init__.py +0 -0
  147. {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/_trace.py +0 -0
  148. {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/exporter.py +0 -0
  149. {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/instrumentation/__init__.py +0 -0
  150. {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/instrumentation/registry.py +0 -0
  151. {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/mcp_models.py +0 -0
  152. {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/tests/__init__.py +0 -0
  153. {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/tests/test_trace.py +0 -0
  154. {hud_python-0.2.5 → hud_python-0.2.6}/hud/trajectory.py +0 -0
  155. {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/__init__.py +0 -0
  156. {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/common.py +0 -0
  157. {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/config.py +0 -0
  158. {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/misc.py +0 -0
  159. {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/progress.py +0 -0
  160. {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/telemetry.py +0 -0
  161. {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/tests/__init__.py +0 -0
  162. {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/tests/test_common.py +0 -0
  163. {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/tests/test_config.py +0 -0
  164. {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/tests/test_progress.py +0 -0
  165. {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/tests/test_telemetry.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.2.5
3
+ Version: 0.2.6
4
4
  Summary: SDK for the HUD evaluation platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
@@ -74,17 +74,17 @@ Description-Content-Type: text/markdown
74
74
  </div>
75
75
 
76
76
  <h3>
77
- Create, evaluate, and improve AI agents across web browsers, desktop environments, and custom scenarios.
77
+ Evaluate your Computer Use AI agents across web browsers, desktop environments, and custom scenarios.
78
78
  </h3>
79
79
 
80
- > ### 🚀 Are you a startup building agents?
81
- >
82
- > [📅 Hop on a call ](https://cal.com/jay-ram-z6st6w/demo) or [📧 founders@hud.so](mailto:founders@hud.so)
83
- >
84
- > We're here to help with eval strategies, custom environments, or improving your agent architecture!
80
+ ### 🚀 Are you a startup building agents?
85
81
 
82
+ [📅 Hop on a call](https://cal.com/jay-ram-z6st6w/demo) or [📧 founders@hud.so](mailto:founders@hud.so)
86
83
 
87
- > **Early Release Notice**: This SDK is currently in early release status. The API is evolving and may change in future releases as we gather feedback and improve functionality.
84
+ We're here to help with eval strategies, custom environments, or improving your agent architecture!
85
+
86
+
87
+ > **Early Release Notice**: We'd love to hear your feedback in [Issues](https://github.com/hud-evals/hud-sdk/issues), as the SDK is still evolving!
88
88
 
89
89
  [![PyPI version](https://img.shields.io/pypi/v/hud-python)](https://pypi.org/project/hud-python/)
90
90
 
@@ -132,23 +132,23 @@ with hud.trace("my-agent-run"):
132
132
  result = await agent.run(task)
133
133
  ```
134
134
 
135
- ## API Key Setup
136
-
137
- Before getting started, you'll need to obtain an API key:
135
+ ## Quick Start
138
136
 
139
- 1. Visit [app.hud.so](https://app.hud.so) to create a free account and generate your API key
140
- 2. Set it in your environment or .env file:
137
+ ### Installation
141
138
 
142
139
  ```bash
143
- export HUD_API_KEY=your_api_key_here
140
+ pip install hud-python
144
141
  ```
145
142
 
146
- ## Quick Start
143
+ ### API Key Setup
147
144
 
148
- ### Installation
145
+ Before getting started, you'll need to obtain an API key:
146
+
147
+ 1. Visit [app.hud.so](https://app.hud.so) to create a free account and generate your API key
148
+ 2. Set it in your environment or .env file:
149
149
 
150
150
  ```bash
151
- pip install hud-python
151
+ export HUD_API_KEY=your_api_key_here
152
152
  ```
153
153
 
154
154
  ### Simple Browser Example with Claude Computer Use
@@ -269,4 +269,4 @@ If you use this SDK in your research, please cite it as follows:
269
269
  url = {https://github.com/hud-evals/hud-sdk},
270
270
  langid = {en}
271
271
  }
272
- ```
272
+ ```
@@ -3,17 +3,17 @@
3
3
  </div>
4
4
 
5
5
  <h3>
6
- Create, evaluate, and improve AI agents across web browsers, desktop environments, and custom scenarios.
6
+ Evaluate your Computer Use AI agents across web browsers, desktop environments, and custom scenarios.
7
7
  </h3>
8
8
 
9
- > ### 🚀 Are you a startup building agents?
10
- >
11
- > [📅 Hop on a call ](https://cal.com/jay-ram-z6st6w/demo) or [📧 founders@hud.so](mailto:founders@hud.so)
12
- >
13
- > We're here to help with eval strategies, custom environments, or improving your agent architecture!
9
+ ### 🚀 Are you a startup building agents?
14
10
 
11
+ [📅 Hop on a call](https://cal.com/jay-ram-z6st6w/demo) or [📧 founders@hud.so](mailto:founders@hud.so)
15
12
 
16
- > **Early Release Notice**: This SDK is currently in early release status. The API is evolving and may change in future releases as we gather feedback and improve functionality.
13
+ We're here to help with eval strategies, custom environments, or improving your agent architecture!
14
+
15
+
16
+ > **Early Release Notice**: We'd love to hear your feedback in [Issues](https://github.com/hud-evals/hud-sdk/issues), as the SDK is still evolving!
17
17
 
18
18
  [![PyPI version](https://img.shields.io/pypi/v/hud-python)](https://pypi.org/project/hud-python/)
19
19
 
@@ -61,23 +61,23 @@ with hud.trace("my-agent-run"):
61
61
  result = await agent.run(task)
62
62
  ```
63
63
 
64
- ## API Key Setup
65
-
66
- Before getting started, you'll need to obtain an API key:
64
+ ## Quick Start
67
65
 
68
- 1. Visit [app.hud.so](https://app.hud.so) to create a free account and generate your API key
69
- 2. Set it in your environment or .env file:
66
+ ### Installation
70
67
 
71
68
  ```bash
72
- export HUD_API_KEY=your_api_key_here
69
+ pip install hud-python
73
70
  ```
74
71
 
75
- ## Quick Start
72
+ ### API Key Setup
76
73
 
77
- ### Installation
74
+ Before getting started, you'll need to obtain an API key:
75
+
76
+ 1. Visit [app.hud.so](https://app.hud.so) to create a free account and generate your API key
77
+ 2. Set it in your environment or .env file:
78
78
 
79
79
  ```bash
80
- pip install hud-python
80
+ export HUD_API_KEY=your_api_key_here
81
81
  ```
82
82
 
83
83
  ### Simple Browser Example with Claude Computer Use
@@ -198,4 +198,4 @@ If you use this SDK in your research, please cite it as follows:
198
198
  url = {https://github.com/hud-evals/hud-sdk},
199
199
  langid = {en}
200
200
  }
201
- ```
201
+ ```
@@ -54,8 +54,6 @@ The HUD SDK provides several standard environment types, specified via the `gym`
54
54
 
55
55
  * **`"hud-browser"`**: Provides a remote Chromium browser instance managed via Playwright. Ideal for web navigation, form interaction, and testing web applications.
56
56
  * [See `hud-browser` Details](../environments/hud-browser.mdx)
57
- * **`"hud-ubuntu"`**: Provides a remote Ubuntu desktop environment accessed via VNC. Suitable for tasks involving GUI applications, file system interaction, or running Linux software.
58
- * [See `hud-ubuntu` Details](../environments/hud-ubuntu.mdx)
59
57
  * **`"qa"`**: A non-interactive environment for question-answering tasks where the agent provides a direct textual response.
60
58
  * [See `qa` Environment Details](../environments/qa.mdx)
61
59
  * **`CustomGym`**: Allows defining and running your own [Custom Environments](../advanced/custom-environments.mdx) using Docker, either locally or remotely. This provides maximum flexibility for specific testing needs.
@@ -55,7 +55,6 @@
55
55
  "environments/browser",
56
56
  "environments/custom",
57
57
  "environments/qa",
58
- "environments/ubuntu",
59
58
  "environments/osworld-ubuntu"
60
59
  ]
61
60
  }
@@ -63,8 +63,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
63
63
  # ... other system dependencies for your environment (e.g., desktop, browsers) ...
64
64
  && rm -rf /var/lib/apt/lists/*
65
65
 
66
+ # Upgrade pip and setuptools to ensure PEP 660 support
67
+ RUN pip3 install --upgrade pip setuptools>=64.0.0 wheel
68
+
66
69
  # Copy your controller source code
67
70
  WORKDIR /app
71
+ RUN mkdir /app_data
72
+
68
73
  COPY ./src /app/src
69
74
  COPY ./pyproject.toml /app/
70
75
 
@@ -92,7 +97,7 @@ dependencies = [
92
97
  ]
93
98
 
94
99
  [build-system]
95
- requires = ["setuptools>=61.0"]
100
+ requires = ["setuptools>=64.0.0", "wheel"]
96
101
  build-backend = "setuptools.build_meta"
97
102
 
98
103
  [project.scripts]
@@ -135,9 +140,26 @@ def verify_output_file(expected_content: str) -> float:
135
140
  logger.error("Evaluation failed: Output file not found.")
136
141
  return 0.0 # Failure
137
142
 
143
+ def step(action: str) -> str:
144
+ """Example step function for a Task."""
145
+ logger.info(f"Controller: Stepping with {action=}")
146
+
147
+ return {
148
+ "observation": {
149
+ "text": "Sample Text",
150
+ "screenshot": None
151
+ }
152
+ }
153
+
138
154
  # You can add more functions as needed for different setup/evaluation logic
139
155
  ```
140
156
 
157
+ ### d. `src/hud_controller/__init__.py`
158
+
159
+ ```python
160
+ from .main import initialize_environment, verify_output_file, step
161
+ ```
162
+
141
163
  ## 4. Building & Testing Locally
142
164
 
143
165
  ### a. Define `CustomGym`
@@ -15,7 +15,7 @@ See [Installation](/installation) for more details on development setup.
15
15
 
16
16
  ## 2. API Key Setup
17
17
 
18
- Set your API keys in a `.env` file:
18
+ Set your API keys in a `.env` file (get your HUD API key from [app.hud.so](https://app.hud.so)):
19
19
 
20
20
  ```bash
21
21
  HUD_API_KEY=sk-hud-...
@@ -51,9 +51,12 @@ async def main():
51
51
  await env.close()
52
52
 
53
53
  if __name__ == "__main__":
54
- asyncio.run(main())
54
+ asyncio.run(main())
55
55
  ```
56
56
 
57
+ Each gym (`hud-browser`, `OSWorld-Ubuntu`, custom) has its own set of setup and evaluate functions, and you can define your own.
58
+ See [setup](/environments/browser#setup-functions-initial-state) and [evaluators](/environments/browser#evaluation-functions) for more info on available functions.
59
+
57
60
  ### Manual Agent Loop
58
61
  ```python
59
62
  env = await gym.make(task)
@@ -0,0 +1,199 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# uv pip install -e \".[dev]\"\n",
10
+ "from hud import gym, load_taskset\n",
11
+ "from pprint import pprint"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "metadata": {},
18
+ "outputs": [
19
+ {
20
+ "name": "stdout",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "Total tasks in OSWorld: 369\n",
24
+ "Task prompt: Can you make my computer bring back the last tab I shut down?\n"
25
+ ]
26
+ }
27
+ ],
28
+ "source": [
29
+ "taskset = await load_taskset(\"OSWorld-Ubuntu\")\n",
30
+ "print(f\"Total tasks in OSWorld: {len(taskset)}\")\n",
31
+ "\n",
32
+ "test = taskset[144]\n",
33
+ "print(f\"Task prompt: {test.prompt}\")"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 3,
39
+ "metadata": {},
40
+ "outputs": [
41
+ {
42
+ "name": "stderr",
43
+ "output_type": "stream",
44
+ "text": [
45
+ "2025-05-27 10:04:56,691 - hud.gym - INFO - Creating private environment\n"
46
+ ]
47
+ }
48
+ ],
49
+ "source": [
50
+ "# The Ubuntu environment will take around 2.5 minutes to start, but can be parallelized\n",
51
+ "env = await gym.make(test)"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "code",
56
+ "execution_count": 4,
57
+ "metadata": {},
58
+ "outputs": [
59
+ {
60
+ "name": "stdout",
61
+ "output_type": "stream",
62
+ "text": [
63
+ "Initial observation complete\n",
64
+ "========= Step 1 =========\n",
65
+ "Agent's action: [PressAction(type='press', keys=['ctrl', 'shift', 't'])]\n",
66
+ "========= Step 2 =========\n",
67
+ "Agent's action: [ResponseAction(type='response', text=\"Great! I've successfully reopened your last closed tab. As you can see, the TripAdvisor tab has been restored. Now you have three tabs open:\\n\\n1. Lonely Planet | Travel Guide\\n2. Airbnb | Vacation rentals\\n3. TripAdvisor: Over a billion reviews & contributions for Hotels\\n\\nThe keyboard shortcut Ctrl+Shift+T is very useful for recovering recently closed tabs in Chrome. You can actually press it multiple times to continue reopening previously closed tabs in the order they were closed.\")]\n"
68
+ ]
69
+ }
70
+ ],
71
+ "source": [
72
+ "from hud.agent import ClaudeAgent\n",
73
+ "\n",
74
+ "# Define a new agent each time to reset the message history\n",
75
+ "# Make sure to define the environment variable ANTHROPIC_API_KEY\n",
76
+ "agent = ClaudeAgent()\n",
77
+ "\n",
78
+ "# Initial observation\n",
79
+ "obs, _ = await env.reset()\n",
80
+ "print(f\"Initial observation complete\")\n",
81
+ "\n",
82
+ "# Agent loop\n",
83
+ "for i in range(8):\n",
84
+ " print(f\"========= Step {i + 1} =========\")\n",
85
+ " action, done = await agent.predict(obs)\n",
86
+ " print(f\"Agent's action: {action}\")\n",
87
+ "\n",
88
+ " obs, reward, terminated, info = await env.step(action)\n",
89
+ "\n",
90
+ " if done or terminated:\n",
91
+ " break"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": 5,
97
+ "metadata": {},
98
+ "outputs": [
99
+ {
100
+ "name": "stdout",
101
+ "output_type": "stream",
102
+ "text": [
103
+ "{'error': None,\n",
104
+ " 'logs': 'INFO: Starting evaluation...\\n'\n",
105
+ " 'INFO: Evaluating task 08d9a8b1-7b7a-4ba7-a226-4e266e13f6df...\\n'\n",
106
+ " 'INFO: Evaluator configuration:\\n'\n",
107
+ " 'INFO: Metric function(s): is_expected_tabs\\n'\n",
108
+ " 'INFO: Metric conjunction: and\\n'\n",
109
+ " 'INFO: Result getter: get_open_tabs_info\\n'\n",
110
+ " 'INFO: Expected getter: get_rule\\n'\n",
111
+ " 'INFO: Metric options: {}\\n'\n",
112
+ " 'INFO: Setting up post-config for evaluation...\\n'\n",
113
+ " 'INFO: Evaluating single metric: is_expected_tabs\\n'\n",
114
+ " \"INFO: Getting result state using config: {'type': 'open_tabs_info'}\\n\"\n",
115
+ " \"INFO: Getting expected state using config: {'type': 'rule', 'rules': \"\n",
116
+ " \"{'type': 'url', 'urls': ['https://www.lonelyplanet.com', \"\n",
117
+ " \"'https://www.airbnb.com', 'https://www.tripadvisor.com']}}\\n\"\n",
118
+ " 'INFO: Comparing result state with expected state\\n'\n",
119
+ " 'INFO: Final evaluation result: 1\\n'\n",
120
+ " 'INFO: Completed evaluation.\\n'\n",
121
+ " 'INFO: Completed evaluation.\\n',\n",
122
+ " 'reward': 1.0}\n"
123
+ ]
124
+ }
125
+ ],
126
+ "source": [
127
+ "# Evaluate environment state\n",
128
+ "result = await env.evaluate()\n",
129
+ "pprint(result)"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": 6,
135
+ "metadata": {},
136
+ "outputs": [],
137
+ "source": [
138
+ "# Make sure to close environment to avoid being charged for idle time\n",
139
+ "await env.close()"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "markdown",
144
+ "metadata": {},
145
+ "source": [
146
+ "Parallel runs for the whole dataset"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "code",
151
+ "execution_count": 26,
152
+ "metadata": {},
153
+ "outputs": [],
154
+ "source": [
155
+ "from hud import run_job\n",
156
+ "\n",
157
+ "taskset = await load_taskset(\"OSWorld-Ubuntu\")\n",
158
+ "job = await run_job(\n",
159
+ " ClaudeAgent,\n",
160
+ " taskset,\n",
161
+ " \"osworld-test\",\n",
162
+ " max_steps_per_task=20,\n",
163
+ " max_concurrent_tasks=20,\n",
164
+ " auto_reply_question=True,\n",
165
+ ")"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "metadata": {},
172
+ "outputs": [],
173
+ "source": [
174
+ "await job.get_analytics()"
175
+ ]
176
+ }
177
+ ],
178
+ "metadata": {
179
+ "kernelspec": {
180
+ "display_name": ".venv",
181
+ "language": "python",
182
+ "name": "python3"
183
+ },
184
+ "language_info": {
185
+ "codemirror_mode": {
186
+ "name": "ipython",
187
+ "version": 3
188
+ },
189
+ "file_extension": ".py",
190
+ "mimetype": "text/x-python",
191
+ "name": "python",
192
+ "nbconvert_exporter": "python",
193
+ "pygments_lexer": "ipython3",
194
+ "version": "3.12.9"
195
+ }
196
+ },
197
+ "nbformat": 4,
198
+ "nbformat_minor": 2
199
+ }
@@ -7,7 +7,6 @@
7
7
  "outputs": [],
8
8
  "source": [
9
9
  "from hud import gym\n",
10
- "from hud.utils import stream\n",
11
10
  "from hud.task import Task"
12
11
  ]
13
12
  },
@@ -41,10 +40,7 @@
41
40
  "source": [
42
41
  "# Create and set up environment with google, takes around 20 seconds\n",
43
42
  "env = await gym.make(task)\n",
44
- "urls = await env.get_urls()\n",
45
- "\n",
46
- "# Stream the live view\n",
47
- "stream(urls[\"live_url\"])"
43
+ "await env.stream()"
48
44
  ]
49
45
  },
50
46
  {
@@ -127,10 +123,7 @@
127
123
  "source": [
128
124
  "# Create and set up environment with google, takes around 20 seconds\n",
129
125
  "env = await gym.make(task)\n",
130
- "urls = await env.get_urls()\n",
131
- "\n",
132
- "# Stream the live view\n",
133
- "stream(urls[\"live_url\"])"
126
+ "await env.stream()"
134
127
  ]
135
128
  },
136
129
  {
@@ -217,9 +210,9 @@
217
210
  "metadata": {},
218
211
  "outputs": [],
219
212
  "source": [
220
- "from hud.adapters.common.types import ResponseAction\n",
213
+ "from hud import Response\n",
221
214
  "\n",
222
- "await env.step([ResponseAction(text=\"Paris\")])"
215
+ "await env.step([Response(text=\"Paris\")])"
223
216
  ]
224
217
  },
225
218
  {