hud-python 0.2.4__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (182) hide show
  1. {hud_python-0.2.4 → hud_python-0.2.5}/PKG-INFO +90 -22
  2. hud_python-0.2.5/README.md +201 -0
  3. hud_python-0.2.5/docs/advanced/tracing.mdx +129 -0
  4. hud_python-0.2.5/docs/advanced/uploading.mdx +160 -0
  5. {hud_python-0.2.4 → hud_python-0.2.5}/docs/api-reference/taskset.mdx +11 -0
  6. hud_python-0.2.5/docs/api-reference/telemetry.mdx +125 -0
  7. hud_python-0.2.5/docs/docs.json +82 -0
  8. hud_python-0.2.5/docs/environment-creation.mdx +356 -0
  9. hud_python-0.2.5/docs/environments/browser.mdx +153 -0
  10. {hud_python-0.2.4/docs/advanced → hud_python-0.2.5/docs/environments}/custom-environments.mdx +27 -19
  11. hud_python-0.2.5/docs/environments/custom.mdx +81 -0
  12. hud_python-0.2.5/docs/environments/osworld-ubuntu.mdx +56 -0
  13. {hud_python-0.2.4 → hud_python-0.2.5}/docs/environments/qa.mdx +4 -0
  14. hud_python-0.2.4/docs/environments/hud-ubuntu.mdx → hud_python-0.2.5/docs/environments/ubuntu.mdx +63 -0
  15. hud_python-0.2.5/docs/examples/alignment-evaluation.mdx +190 -0
  16. hud_python-0.2.5/docs/examples/benchmarking-agents.mdx +232 -0
  17. hud_python-0.2.5/docs/examples/custom-os-env.mdx +333 -0
  18. hud_python-0.2.5/docs/examples/mcp-agent-tracing.mdx +210 -0
  19. hud_python-0.2.5/docs/examples/web-app-testing.mdx +258 -0
  20. hud_python-0.2.5/docs/logo/hud_logo.svg +25 -0
  21. hud_python-0.2.5/docs/logo/hud_logo_dark.svg +46 -0
  22. hud_python-0.2.5/docs/quickstart.mdx +291 -0
  23. {hud_python-0.2.4 → hud_python-0.2.5}/docs/running-your-agent.mdx +3 -2
  24. hud_python-0.2.5/docs/task-creation.mdx +201 -0
  25. hud_python-0.2.5/environments/novnc_ubuntu/Dockerfile +8 -0
  26. {hud_python-0.2.4 → hud_python-0.2.5}/environments/novnc_ubuntu/pyproject.toml +1 -1
  27. {hud_python-0.2.4/environments/novnc_ubuntu/src/novnc_ubuntu → hud_python-0.2.5/environments/novnc_ubuntu/src/hud_controller}/step.py +1 -1
  28. hud_python-0.2.5/environments/pokemon_controller/Dockerfile +11 -0
  29. hud_python-0.2.5/environments/pokemon_controller/pyproject.toml +19 -0
  30. hud_python-0.2.5/environments/pokemon_controller/src/hud_controller/__init__.py +8 -0
  31. hud_python-0.2.5/environments/pokemon_controller/src/hud_controller/display_adapters.py +113 -0
  32. hud_python-0.2.5/environments/pokemon_controller/src/hud_controller/emulator.py +319 -0
  33. hud_python-0.2.5/environments/pokemon_controller/src/hud_controller/evaluator.py +65 -0
  34. hud_python-0.2.5/environments/pokemon_controller/src/hud_controller/kill.py +61 -0
  35. hud_python-0.2.5/environments/pokemon_controller/src/hud_controller/main.py +137 -0
  36. hud_python-0.2.5/environments/pokemon_controller/src/hud_controller/setup.py +63 -0
  37. hud_python-0.2.5/environments/pokemon_controller/src/hud_controller/step.py +37 -0
  38. hud_python-0.2.5/environments/qa_controller/Dockerfile +20 -0
  39. {hud_python-0.2.4 → hud_python-0.2.5}/environments/qa_controller/pyproject.toml +2 -1
  40. {hud_python-0.2.4/environments/qa_controller/src/qa_controller → hud_python-0.2.5/environments/qa_controller/src/hud_controller}/evaluate/matchers.py +1 -1
  41. {hud_python-0.2.4/environments/qa_controller/src/qa_controller → hud_python-0.2.5/environments/qa_controller/src/hud_controller}/info.py +1 -1
  42. {hud_python-0.2.4/environments/qa_controller/src/qa_controller → hud_python-0.2.5/environments/qa_controller/src/hud_controller}/setup/question.py +1 -1
  43. {hud_python-0.2.4/environments/qa_controller/src/qa_controller → hud_python-0.2.5/environments/qa_controller/src/hud_controller}/step.py +1 -1
  44. {hud_python-0.2.4 → hud_python-0.2.5}/examples/README.md +1 -0
  45. {hud_python-0.2.4 → hud_python-0.2.5}/examples/browser_use.ipynb +8 -19
  46. {hud_python-0.2.4 → hud_python-0.2.5}/examples/example.ipynb +0 -9
  47. {hud_python-0.2.4 → hud_python-0.2.5}/examples/local.ipynb +5 -4
  48. hud_python-0.2.5/examples/mcp_test.ipynb +98 -0
  49. hud_python-0.2.5/examples/pokemon_local.ipynb +820 -0
  50. hud_python-0.2.5/examples/pokemon_remote.ipynb +712 -0
  51. hud_python-0.2.5/examples/remote.ipynb +70 -0
  52. hud_python-0.2.5/hud/__init__.py +48 -0
  53. {hud_python-0.2.4 → hud_python-0.2.5}/hud/adapters/claude/adapter.py +9 -2
  54. hud_python-0.2.5/hud/adapters/claude/tests/__init__.py +1 -0
  55. hud_python-0.2.5/hud/adapters/claude/tests/test_adapter.py +519 -0
  56. {hud_python-0.2.4 → hud_python-0.2.5}/hud/adapters/common/types.py +5 -1
  57. {hud_python-0.2.4 → hud_python-0.2.5}/hud/adapters/operator/adapter.py +4 -0
  58. hud_python-0.2.5/hud/adapters/operator/tests/__init__.py +1 -0
  59. hud_python-0.2.5/hud/adapters/operator/tests/test_adapter.py +370 -0
  60. {hud_python-0.2.4 → hud_python-0.2.5}/hud/agent/__init__.py +4 -0
  61. {hud_python-0.2.4 → hud_python-0.2.5}/hud/agent/base.py +18 -2
  62. {hud_python-0.2.4 → hud_python-0.2.5}/hud/agent/claude.py +20 -17
  63. hud_python-0.2.5/hud/agent/claude_plays_pokemon.py +282 -0
  64. {hud_python-0.2.4 → hud_python-0.2.5}/hud/agent/langchain.py +12 -7
  65. hud_python-0.2.5/hud/agent/misc/__init__.py +3 -0
  66. hud_python-0.2.5/hud/agent/misc/response_agent.py +80 -0
  67. {hud_python-0.2.4 → hud_python-0.2.5}/hud/agent/operator.py +27 -19
  68. hud_python-0.2.5/hud/agent/tests/__init__.py +1 -0
  69. hud_python-0.2.5/hud/agent/tests/test_base.py +202 -0
  70. {hud_python-0.2.4 → hud_python-0.2.5}/hud/env/docker_client.py +28 -18
  71. {hud_python-0.2.4 → hud_python-0.2.5}/hud/env/environment.py +32 -16
  72. {hud_python-0.2.4 → hud_python-0.2.5}/hud/env/local_docker_client.py +83 -42
  73. {hud_python-0.2.4 → hud_python-0.2.5}/hud/env/remote_client.py +1 -3
  74. {hud_python-0.2.4 → hud_python-0.2.5}/hud/env/remote_docker_client.py +72 -15
  75. {hud_python-0.2.4 → hud_python-0.2.5}/hud/exceptions.py +12 -0
  76. hud_python-0.2.5/hud/gym.py +128 -0
  77. {hud_python-0.2.4 → hud_python-0.2.5}/hud/job.py +52 -7
  78. {hud_python-0.2.4 → hud_python-0.2.5}/hud/settings.py +6 -0
  79. {hud_python-0.2.4 → hud_python-0.2.5}/hud/task.py +45 -33
  80. {hud_python-0.2.4 → hud_python-0.2.5}/hud/taskset.py +44 -4
  81. hud_python-0.2.5/hud/telemetry/__init__.py +21 -0
  82. hud_python-0.2.5/hud/telemetry/_trace.py +173 -0
  83. hud_python-0.2.5/hud/telemetry/context.py +193 -0
  84. hud_python-0.2.5/hud/telemetry/exporter.py +417 -0
  85. hud_python-0.2.5/hud/telemetry/instrumentation/__init__.py +3 -0
  86. hud_python-0.2.5/hud/telemetry/instrumentation/mcp.py +498 -0
  87. hud_python-0.2.5/hud/telemetry/instrumentation/registry.py +59 -0
  88. hud_python-0.2.5/hud/telemetry/mcp_models.py +331 -0
  89. hud_python-0.2.5/hud/telemetry/tests/__init__.py +1 -0
  90. hud_python-0.2.5/hud/telemetry/tests/test_context.py +203 -0
  91. hud_python-0.2.5/hud/telemetry/tests/test_trace.py +270 -0
  92. hud_python-0.2.5/hud/types.py +54 -0
  93. {hud_python-0.2.4 → hud_python-0.2.5}/hud/utils/common.py +22 -2
  94. hud_python-0.2.5/hud/utils/misc.py +53 -0
  95. {hud_python-0.2.4 → hud_python-0.2.5}/hud/utils/tests/test_version.py +1 -1
  96. hud_python-0.2.5/hud/version.py +7 -0
  97. {hud_python-0.2.4 → hud_python-0.2.5}/pyproject.toml +25 -2
  98. hud_python-0.2.4/README.md +0 -136
  99. hud_python-0.2.4/docs/api/reference/adapters.mdx +0 -201
  100. hud_python-0.2.4/docs/docs.json +0 -81
  101. hud_python-0.2.4/docs/environments/hud-browser.mdx +0 -67
  102. hud_python-0.2.4/docs/examples/basic.mdx +0 -140
  103. hud_python-0.2.4/docs/examples/claude-agent.mdx +0 -98
  104. hud_python-0.2.4/docs/examples/custom-agent.mdx +0 -133
  105. hud_python-0.2.4/docs/installation.mdx +0 -59
  106. hud_python-0.2.4/docs/logo/HUD-light-optimized.svg +0 -5
  107. hud_python-0.2.4/docs/logo/HUD.svg +0 -5
  108. hud_python-0.2.4/docs/quickstart.mdx +0 -120
  109. hud_python-0.2.4/environments/novnc_ubuntu/Dockerfile +0 -1
  110. hud_python-0.2.4/environments/qa_controller/Dockerfile +0 -16
  111. hud_python-0.2.4/examples/WebVoyager_data.jsonl +0 -643
  112. hud_python-0.2.4/examples/ds_upload.ipynb +0 -2316
  113. hud_python-0.2.4/examples/inspect.ipynb +0 -2087
  114. hud_python-0.2.4/hud/__init__.py +0 -28
  115. hud_python-0.2.4/hud/gym.py +0 -110
  116. hud_python-0.2.4/hud/types.py +0 -70
  117. {hud_python-0.2.4 → hud_python-0.2.5}/.env.example +0 -0
  118. {hud_python-0.2.4 → hud_python-0.2.5}/.github/workflows/ci.yml +0 -0
  119. {hud_python-0.2.4 → hud_python-0.2.5}/.github/workflows/release.yml +0 -0
  120. {hud_python-0.2.4 → hud_python-0.2.5}/.gitignore +0 -0
  121. {hud_python-0.2.4 → hud_python-0.2.5}/LICENSE +0 -0
  122. {hud_python-0.2.4 → hud_python-0.2.5}/MANIFEST.in +0 -0
  123. {hud_python-0.2.4 → hud_python-0.2.5}/docs/advanced/cla-details.mdx +0 -0
  124. {hud_python-0.2.4 → hud_python-0.2.5}/docs/advanced/environment-control.mdx +0 -0
  125. {hud_python-0.2.4 → hud_python-0.2.5}/docs/api-reference/adapters.mdx +0 -0
  126. {hud_python-0.2.4 → hud_python-0.2.5}/docs/api-reference/env.mdx +0 -0
  127. {hud_python-0.2.4 → hud_python-0.2.5}/docs/api-reference/gym.mdx +0 -0
  128. {hud_python-0.2.4 → hud_python-0.2.5}/docs/api-reference/job.mdx +0 -0
  129. {hud_python-0.2.4 → hud_python-0.2.5}/docs/api-reference/task.mdx +0 -0
  130. {hud_python-0.2.4 → hud_python-0.2.5}/docs/api-reference/trajectory.mdx +0 -0
  131. {hud_python-0.2.4 → hud_python-0.2.5}/docs/concepts/adapter.mdx +0 -0
  132. {hud_python-0.2.4 → hud_python-0.2.5}/docs/concepts/agent.mdx +0 -0
  133. {hud_python-0.2.4 → hud_python-0.2.5}/docs/concepts/environment.mdx +0 -0
  134. {hud_python-0.2.4 → hud_python-0.2.5}/docs/concepts/job.mdx +0 -0
  135. {hud_python-0.2.4 → hud_python-0.2.5}/docs/concepts/task.mdx +0 -0
  136. {hud_python-0.2.4 → hud_python-0.2.5}/docs/concepts/trajectory.mdx +0 -0
  137. {hud_python-0.2.4 → hud_python-0.2.5}/docs/favicon.png +0 -0
  138. {hud_python-0.2.4/environments/novnc_ubuntu/src/novnc_ubuntu → hud_python-0.2.5/environments/novnc_ubuntu/src/hud_controller}/__init__.py +0 -0
  139. {hud_python-0.2.4/environments/novnc_ubuntu/src/novnc_ubuntu → hud_python-0.2.5/environments/novnc_ubuntu/src/hud_controller}/pyautogui_rosetta.py +0 -0
  140. {hud_python-0.2.4/environments/qa_controller/src/qa_controller → hud_python-0.2.5/environments/qa_controller/src/hud_controller}/__init__.py +0 -0
  141. {hud_python-0.2.4/environments/qa_controller/src/qa_controller → hud_python-0.2.5/environments/qa_controller/src/hud_controller}/evaluate/__init__.py +0 -0
  142. {hud_python-0.2.4/environments/qa_controller/src/qa_controller → hud_python-0.2.5/environments/qa_controller/src/hud_controller}/setup/__init__.py +0 -0
  143. {hud_python-0.2.4/environments/qa_controller/src/qa_controller → hud_python-0.2.5/environments/qa_controller/src/hud_controller}/utils/__init__.py +0 -0
  144. {hud_python-0.2.4/environments/qa_controller/src/qa_controller → hud_python-0.2.5/environments/qa_controller/src/hud_controller}/utils/state.py +0 -0
  145. {hud_python-0.2.4 → hud_python-0.2.5}/examples/jobs.ipynb +0 -0
  146. {hud_python-0.2.4 → hud_python-0.2.5}/examples/osworld.ipynb +0 -0
  147. {hud_python-0.2.4 → hud_python-0.2.5}/examples/tasks.ipynb +0 -0
  148. {hud_python-0.2.4 → hud_python-0.2.5}/hud/adapters/__init__.py +0 -0
  149. {hud_python-0.2.4 → hud_python-0.2.5}/hud/adapters/claude/__init__.py +0 -0
  150. {hud_python-0.2.4 → hud_python-0.2.5}/hud/adapters/common/__init__.py +0 -0
  151. {hud_python-0.2.4 → hud_python-0.2.5}/hud/adapters/common/adapter.py +0 -0
  152. {hud_python-0.2.4 → hud_python-0.2.5}/hud/adapters/common/tests/__init__.py +0 -0
  153. {hud_python-0.2.4 → hud_python-0.2.5}/hud/adapters/common/tests/test_adapter.py +0 -0
  154. {hud_python-0.2.4 → hud_python-0.2.5}/hud/adapters/operator/__init__.py +0 -0
  155. {hud_python-0.2.4 → hud_python-0.2.5}/hud/env/__init__.py +0 -0
  156. {hud_python-0.2.4 → hud_python-0.2.5}/hud/env/client.py +0 -0
  157. {hud_python-0.2.4 → hud_python-0.2.5}/hud/evaluators/__init__.py +0 -0
  158. {hud_python-0.2.4 → hud_python-0.2.5}/hud/evaluators/base.py +0 -0
  159. {hud_python-0.2.4 → hud_python-0.2.5}/hud/evaluators/inspect.py +0 -0
  160. {hud_python-0.2.4 → hud_python-0.2.5}/hud/evaluators/judge.py +0 -0
  161. {hud_python-0.2.4 → hud_python-0.2.5}/hud/evaluators/match.py +0 -0
  162. {hud_python-0.2.4 → hud_python-0.2.5}/hud/evaluators/remote.py +0 -0
  163. {hud_python-0.2.4 → hud_python-0.2.5}/hud/evaluators/tests/__init__.py +0 -0
  164. {hud_python-0.2.4 → hud_python-0.2.5}/hud/evaluators/tests/test_inspect.py +0 -0
  165. {hud_python-0.2.4 → hud_python-0.2.5}/hud/evaluators/tests/test_judge.py +0 -0
  166. {hud_python-0.2.4 → hud_python-0.2.5}/hud/evaluators/tests/test_match.py +0 -0
  167. {hud_python-0.2.4 → hud_python-0.2.5}/hud/evaluators/tests/test_remote.py +0 -0
  168. {hud_python-0.2.4 → hud_python-0.2.5}/hud/py.typed +0 -0
  169. {hud_python-0.2.4 → hud_python-0.2.5}/hud/server/__init__.py +0 -0
  170. {hud_python-0.2.4 → hud_python-0.2.5}/hud/server/requests.py +0 -0
  171. {hud_python-0.2.4 → hud_python-0.2.5}/hud/server/tests/__init__.py +0 -0
  172. {hud_python-0.2.4 → hud_python-0.2.5}/hud/server/tests/test_requests.py +0 -0
  173. {hud_python-0.2.4 → hud_python-0.2.5}/hud/trajectory.py +0 -0
  174. {hud_python-0.2.4 → hud_python-0.2.5}/hud/utils/__init__.py +0 -0
  175. {hud_python-0.2.4 → hud_python-0.2.5}/hud/utils/config.py +0 -0
  176. {hud_python-0.2.4 → hud_python-0.2.5}/hud/utils/progress.py +0 -0
  177. {hud_python-0.2.4 → hud_python-0.2.5}/hud/utils/telemetry.py +0 -0
  178. {hud_python-0.2.4 → hud_python-0.2.5}/hud/utils/tests/__init__.py +0 -0
  179. {hud_python-0.2.4 → hud_python-0.2.5}/hud/utils/tests/test_common.py +0 -0
  180. {hud_python-0.2.4 → hud_python-0.2.5}/hud/utils/tests/test_config.py +0 -0
  181. {hud_python-0.2.4 → hud_python-0.2.5}/hud/utils/tests/test_progress.py +0 -0
  182. {hud_python-0.2.4 → hud_python-0.2.5}/hud/utils/tests/test_telemetry.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.2.4
3
+ Version: 0.2.5
4
4
  Summary: SDK for the HUD evaluation platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
@@ -38,11 +38,13 @@ Classifier: Programming Language :: Python :: 3.13
38
38
  Requires-Python: <3.14,>=3.10
39
39
  Requires-Dist: aiodocker>=0.24.0
40
40
  Requires-Dist: anthropic
41
+ Requires-Dist: dotenv>=0.9.9
41
42
  Requires-Dist: httpx<1,>=0.23.0
42
43
  Requires-Dist: inspect-ai>=0.3.80
43
44
  Requires-Dist: ipykernel
44
45
  Requires-Dist: langchain
45
46
  Requires-Dist: langchain-openai
47
+ Requires-Dist: mcp
46
48
  Requires-Dist: numpy
47
49
  Requires-Dist: openai
48
50
  Requires-Dist: pillow>=11.1.0
@@ -50,6 +52,7 @@ Requires-Dist: pydantic-settings<3,>=2
50
52
  Requires-Dist: pydantic<3,>=2
51
53
  Requires-Dist: textdistance<5,>=4.5.0
52
54
  Requires-Dist: toml>=0.10.2
55
+ Requires-Dist: wrapt>=1.14.0
53
56
  Provides-Extra: dev
54
57
  Requires-Dist: anthropic; extra == 'dev'
55
58
  Requires-Dist: dotenv; extra == 'dev'
@@ -66,15 +69,68 @@ Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
66
69
  Requires-Dist: ruff==0.11.8; extra == 'dev'
67
70
  Description-Content-Type: text/markdown
68
71
 
69
- # HUD
72
+ <div align="left">
73
+ <img src="https://raw.githubusercontent.com/hud-evals/hud-sdk/main/docs/logo/hud_logo.svg" alt="HUD" width="150" style="margin-bottom: 20px;"/>
74
+ </div>
75
+
76
+ <h3>
77
+ Create, evaluate, and improve AI agents across web browsers, desktop environments, and custom scenarios.
78
+ </h3>
79
+
80
+ > ### 🚀 Are you a startup building agents?
81
+ >
82
+ > [📅 Hop on a call](https://cal.com/jay-ram-z6st6w/demo) or [📧 founders@hud.so](mailto:founders@hud.so)
83
+ >
84
+ > We're here to help with eval strategies, custom environments, or improving your agent architecture!
70
85
 
71
- A Python SDK for creating, evaluating, and benchmarking agent interactions with web browsers and OS environments.
72
86
 
73
87
  > **Early Release Notice**: This SDK is currently in early release status. The API is evolving and may change in future releases as we gather feedback and improve functionality.
74
88
 
75
89
  [![PyPI version](https://img.shields.io/pypi/v/hud-python)](https://pypi.org/project/hud-python/)
76
90
 
77
- [📚 Documentation](https://documentation.hud.so) | [🏠 Homepage](https://hud.so)
91
+ ## What You Can Do
92
+
93
+ **Evaluate Existing Benchmarks**
94
+ ```python
95
+ from hud import load_taskset, run_job, ClaudeAgent
96
+
97
+ taskset = await load_taskset("WebVoyager") # or GAIA, OSWorld-Ubuntu, Mind2Web
98
+ job = await run_job(ClaudeAgent, taskset, "my-evaluation")
99
+ ```
100
+
101
+ **Create Custom Tasks**
102
+ ```python
103
+ from hud.task import Task
104
+
105
+ task = Task(
106
+ prompt="Find and book the cheapest flight from NYC to Paris",
107
+ gym="hud-browser",
108
+ setup=("goto", "https://kayak.com"),
109
+ evaluate=("page_contains", "confirmation")
110
+ )
111
+ ```
112
+
113
+ **Build Custom Environments**
114
+ ```python
115
+ from hud.types import CustomGym
116
+
117
+ # Launch any website as an environment
118
+ custom_gym = CustomGym(
119
+ image_or_build_context="nginx:alpine",
120
+ location="local"
121
+ )
122
+
123
+ # Or create complex Docker environments - see environments/ folder for examples
124
+ ```
125
+
126
+ **Trace Tool Calls Alongside HUD Environments (or Independently)**
127
+ ```python
128
+ import hud
129
+
130
+ with hud.trace("my-agent-run"):
131
+ # Your agent code here - MCP calls automatically captured
132
+ result = await agent.run(task)
133
+ ```
78
134
 
79
135
  ## API Key Setup
80
136
 
@@ -119,7 +175,7 @@ async def main():
119
175
  # Create environment using the gym module
120
176
  env = await gym.make(task)
121
177
 
122
- # Initialize Operator agent (API key is loaded automatically)
178
+ # Initialize Claude agent (API key is loaded automatically)
123
179
  agent = ClaudeAgent()
124
180
 
125
181
  # Agent loop with predict and step functions
@@ -137,7 +193,6 @@ async def main():
137
193
 
138
194
  if __name__ == "__main__":
139
195
  asyncio.run(main())
140
-
141
196
  ```
142
197
 
143
198
  Alternatively, run a full evaluation set via the ```run_job``` command:
@@ -145,32 +200,45 @@ Alternatively, run a full evaluation set via the ```run_job``` command:
145
200
  ```python
146
201
  from hud import load_taskset, run_job, ClaudeAgent
147
202
 
148
- # load
203
+ # Load a benchmark
149
204
  taskset = load_taskset("GAIA")
150
205
 
151
- # evaluate
206
+ # Evaluate
152
207
  job = await run_job(ClaudeAgent, taskset, "test-gaia-job")
153
208
 
154
- # get results OR view them in app.hud.so
209
+ # Get results OR view them in app.hud.so
155
210
  print(await job.get_analytics())
156
211
  ```
157
212
 
213
+ ## Ready-to-Use TaskSets
214
+
215
+ - **WebVoyager** - Web navigation and interaction
216
+ - **Mind2Web** - Complex web application tasks
217
+ - **GAIA** - Question answering and reasoning
218
+ - **OSWorld-Ubuntu** - Desktop interaction
219
+ - **hud-samples** - Getting started examples
220
+
221
+ ## Community
222
+
223
+ **Contributing Custom Environments**
224
+
225
+ Add your environment to the `environments/` folder and submit a PR! Examples:
226
+ - `environments/novnc_ubuntu/` - Ubuntu with VNC access
227
+ - `environments/pokemon_controller/` - Pokemon emulator environment (In Development)
228
+ - `environments/qa_controller/` - Lightweight app sample
229
+
230
+ See [Custom Environments Guide](https://docs.hud.so/environment-creation) for details.
231
+
158
232
  ## Documentation Sections
159
233
 
160
234
  Explore the core concepts and features of the SDK:
161
235
 
162
- * **[Tasks and TaskSets](https://documentation.hud.so/concepts/task)**: Define goals, context, setup, and evaluation criteria for agent scenarios. This includes both interactive and **question-answering (QA)** style tasks.
163
- * **[Environments](https://documentation.hud.so/concepts/environment)**: Understand the browser and OS runtimes where agents interact.
164
- * **[Agents](https://documentation.hud.so/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
165
- * **[Adapters](https://documentation.hud.so/concepts/adapter)**: See how actions and observations are translated between agents and environments.
166
- * **[Jobs](https://documentation.hud.so/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
167
- * **[Trajectories](https://documentation.hud.so/concepts/trajectory)**: Understand the recorded data from each agent run.
168
- * **Advanced Topics**:
169
- * **[CLA Action Details](https://documentation.hud.so/advanced/cla-details)**: Explore the standardized action format.
170
- * **[Custom Environments](https://documentation.hud.so/advanced/custom-environments)**: Build your own Docker-based local or remote environments.
171
- * **[Advanced Environment Control](https://documentation.hud.so/advanced/environment-control)**: Use `invoke`, `execute`, and `_setup` for finer control.
172
-
173
- * **[Full API Reference](https://documentation.hud.so/api-reference/gym)**: Detailed specifications for all modules and classes.
236
+ * **[Task Creation](https://docs.hud.so/task-creation)**: Build custom evaluation scenarios with setup and evaluation criteria.
237
+ * **[Environments](https://docs.hud.so/environments/browser)**: Understand browser environments and create custom Docker-based environments.
238
+ * **[Agents](https://docs.hud.so/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
239
+ * **[Jobs](https://docs.hud.so/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
240
+ * **[MCP Telemetry](https://docs.hud.so/telemetry/mcp)**: Automatic tracing of Model Context Protocol interactions.
241
+ * **[Full API Reference](https://docs.hud.so/api-reference/gym)**: Detailed specifications for all modules and classes.
174
242
 
175
243
  ## [Examples](examples/)
176
244
 
@@ -183,7 +251,7 @@ We recommend you first take a look at the example notebooks showing how to use t
183
251
 
184
252
  ## Documentation
185
253
 
186
- For comprehensive guides, examples, and API reference, visit [our docs](https://documentation.hud.so/introduction)
254
+ For comprehensive guides, examples, and API reference, visit [our docs](https://docs.hud.so/introduction)
187
255
 
188
256
  ## License
189
257
 
@@ -0,0 +1,201 @@
1
+ <div align="left">
2
+ <img src="https://raw.githubusercontent.com/hud-evals/hud-sdk/main/docs/logo/hud_logo.svg" alt="HUD" width="150" style="margin-bottom: 20px;"/>
3
+ </div>
4
+
5
+ <h3>
6
+ Create, evaluate, and improve AI agents across web browsers, desktop environments, and custom scenarios.
7
+ </h3>
8
+
9
+ > ### 🚀 Are you a startup building agents?
10
+ >
11
+ > [📅 Hop on a call](https://cal.com/jay-ram-z6st6w/demo) or [📧 founders@hud.so](mailto:founders@hud.so)
12
+ >
13
+ > We're here to help with eval strategies, custom environments, or improving your agent architecture!
14
+
15
+
16
+ > **Early Release Notice**: This SDK is currently in early release status. The API is evolving and may change in future releases as we gather feedback and improve functionality.
17
+
18
+ [![PyPI version](https://img.shields.io/pypi/v/hud-python)](https://pypi.org/project/hud-python/)
19
+
20
+ ## ✨ What You Can Do
21
+
22
+ **Evaluate Existing Benchmarks**
23
+ ```python
24
+ from hud import load_taskset, run_job, ClaudeAgent
25
+
26
+ taskset = await load_taskset("WebVoyager") # or GAIA, OSWorld-Ubuntu, Mind2Web
27
+ job = await run_job(ClaudeAgent, taskset, "my-evaluation")
28
+ ```
29
+
30
+ **Create Custom Tasks**
31
+ ```python
32
+ from hud.task import Task
33
+
34
+ task = Task(
35
+ prompt="Find and book the cheapest flight from NYC to Paris",
36
+ gym="hud-browser",
37
+ setup=("goto", "https://kayak.com"),
38
+ evaluate=("page_contains", "confirmation")
39
+ )
40
+ ```
41
+
42
+ **Build Custom Environments**
43
+ ```python
44
+ from hud.types import CustomGym
45
+
46
+ # Launch any website as an environment
47
+ custom_gym = CustomGym(
48
+ image_or_build_context="nginx:alpine",
49
+ location="local"
50
+ )
51
+
52
+ # Or create complex Docker environments - see environments/ folder for examples
53
+ ```
54
+
55
+ **Trace Tool Calls Alongside HUD Environments (or Independently)**
56
+ ```python
57
+ import hud
58
+
59
+ with hud.trace("my-agent-run"):
60
+ # Your agent code here - MCP calls automatically captured
61
+ result = await agent.run(task)
62
+ ```
63
+
64
+ ## API Key Setup
65
+
66
+ Before getting started, you'll need to obtain an API key:
67
+
68
+ 1. Visit [app.hud.so](https://app.hud.so) to create a free account and generate your API key
69
+ 2. Set it in your environment or .env file:
70
+
71
+ ```bash
72
+ export HUD_API_KEY=your_api_key_here
73
+ ```
74
+
75
+ ## Quick Start
76
+
77
+ ### Installation
78
+
79
+ ```bash
80
+ pip install hud-python
81
+ ```
82
+
83
+ ### Simple Browser Example with Claude Computer Use
84
+
85
+ > This example uses the `@register_job("test-run")` decorator, so the results of this run will appear under the job named "test-run" on your [HUD Jobs page](https://app.hud.so/jobs).
86
+
87
+ Make sure you have defined your `ANTHROPIC_API_KEY` in your environment variables to run Claude.
88
+
89
+ ```python
90
+ import asyncio
91
+ from hud import gym, register_job
92
+ from hud.task import Task
93
+ from hud.agent import ClaudeAgent
94
+
95
+ @register_job("test-run")
96
+ async def main():
97
+ task = Task(
98
+ prompt="Insert the text 'capybara' into the search bar",
99
+ gym="hud-browser",
100
+ setup=("goto", "google.com"),
101
+ evaluate=("contains_text", "capybara")
102
+ )
103
+
104
+ # Create environment using the gym module
105
+ env = await gym.make(task)
106
+
107
+ # Initialize Claude agent (API key is loaded automatically)
108
+ agent = ClaudeAgent()
109
+
110
+ # Agent loop with predict and step functions
111
+ obs, _ = await env.reset() # Gets first observation
112
+ for i in range(5):
113
+ actions, done = await agent.predict(obs)
114
+
115
+ obs, reward, terminated, info = await env.step(actions)
116
+ if done or terminated: break
117
+
118
+ # Evaluate and close
119
+ result = await env.evaluate()
120
+ print(f"Evaluation result: {result}")
121
+ await env.close()
122
+
123
+ if __name__ == "__main__":
124
+ asyncio.run(main())
125
+ ```
126
+
127
+ Alternatively, run a full evaluation set via the `run_job` function:
128
+
129
+ ```python
130
+ from hud import load_taskset, run_job, ClaudeAgent
131
+
132
+ # Load a benchmark
133
+ taskset = load_taskset("GAIA")
134
+
135
+ # Evaluate
136
+ job = await run_job(ClaudeAgent, taskset, "test-gaia-job")
137
+
138
+ # Get results OR view them in app.hud.so
139
+ print(await job.get_analytics())
140
+ ```
141
+
142
+ ## Ready-to-Use TaskSets
143
+
144
+ - **WebVoyager** - Web navigation and interaction
145
+ - **Mind2Web** - Complex web application tasks
146
+ - **GAIA** - Question answering and reasoning
147
+ - **OSWorld-Ubuntu** - Desktop interaction
148
+ - **hud-samples** - Getting started examples
149
+
150
+ ## Community
151
+
152
+ **Contributing Custom Environments**
153
+
154
+ Add your environment to the `environments/` folder and submit a PR! Examples:
155
+ - `environments/novnc_ubuntu/` - Ubuntu with VNC access
156
+ - `environments/pokemon_controller/` - Pokemon emulator environment (In Development)
157
+ - `environments/qa_controller/` - Lightweight app sample
158
+
159
+ See [Custom Environments Guide](https://docs.hud.so/environment-creation) for details.
160
+
161
+ ## Documentation Sections
162
+
163
+ Explore the core concepts and features of the SDK:
164
+
165
+ * **[Task Creation](https://docs.hud.so/task-creation)**: Build custom evaluation scenarios with setup and evaluation criteria.
166
+ * **[Environments](https://docs.hud.so/environments/browser)**: Understand browser environments and create custom Docker-based environments.
167
+ * **[Agents](https://docs.hud.so/concepts/agent)**: Learn about the agent architecture (Claude, Operator) and how they process observations and predict actions.
168
+ * **[Jobs](https://docs.hud.so/concepts/job)**: Group related runs for analysis and viewing on the HUD platform.
169
+ * **[MCP Telemetry](https://docs.hud.so/telemetry/mcp)**: Automatic tracing of Model Context Protocol interactions.
170
+ * **[Full API Reference](https://docs.hud.so/api-reference/gym)**: Detailed specifications for all modules and classes.
171
+
172
+ ## [Examples](examples/)
173
+
174
+ We recommend you first take a look at the example notebooks showing how to use the HUD SDK:
175
+
176
+ 1. [Browser Basics](examples/browser_use.ipynb) - Simple browser interaction with live view
177
+ 2. [Task Design](examples/tasks.ipynb) - Creating and customizing tasks
178
+ 3. [OSWorld](examples/osworld.ipynb) - Running the OSWorld benchmark
179
+ 4. [Local Development](examples/local.ipynb) - Setting up local custom environments
180
+
181
+ ## Documentation
182
+
183
+ For comprehensive guides, examples, and API reference, visit [our docs](https://docs.hud.so/introduction)
184
+
185
+ ## License
186
+
187
+ [MIT License](LICENSE)
188
+
189
+ ## Citation
190
+
191
+ If you use this SDK in your research, please cite it as follows:
192
+
193
+ ```bibtex
194
+ @software{hud2025agentevalplatform,
195
+ author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
196
+ title = {{HUD: An Evaluation Platform for Agents}},
197
+ date = {2025-04},
198
+ url = {https://github.com/hud-evals/hud-sdk},
199
+ langid = {en}
200
+ }
201
+ ```
@@ -0,0 +1,129 @@
1
+ ---
2
+ title: 'Tracing'
3
+ description: 'Capture and analyze MCP calls with tracing functionality'
4
+ ---
5
+
6
+ # Tracing
7
+
8
+ The HUD SDK provides tracing functionality to capture and analyze MCP (Model Context Protocol) calls during agent execution. This is particularly useful for debugging, performance analysis, and understanding the interaction between your agent and external services.
9
+
10
+ ## Overview
11
+
12
+ Tracing in HUD allows you to:
13
+
14
+ - Capture all MCP calls made by an agent within a specific code block or decorated function.
15
+ - Automatically upload these traces to the HUD platform ([app.hud.so](https://app.hud.so)) for detailed analysis.
16
+ - View comprehensive information about each call, including request/response payloads, timing, status, and any errors.
17
+ - Associate traces with specific task runs, jobs, and custom attributes for better organization and filtering.
18
+
19
+ This capability is essential whether your agent is interacting with HUD environments (like `hud-browser`) or using its own set of MCP-based tools independently.
20
+
21
+ ## Using Tracing
22
+
23
+ There are two main ways to use tracing in your code:
24
+
25
+ ### 1. Context Manager: `hud.trace()`
26
+
27
+ Use the `trace()` context manager to wrap a block of code where you want to capture MCP calls:
28
+
29
+ ```python
30
+ import hud
31
+ from mcp_use import MCPAgent, MCPClient
32
+ from langchain_openai import ChatOpenAI
33
+
34
+ # Create MCP components
35
+ client = MCPClient.from_dict({
36
+ "mcpServers": {
37
+ "example_server": {
38
+ "command": "npx",
39
+ "args": ["-y", "@example/mcp-server", "--option"],
40
+ "env": {
41
+ "OPTION_FLAG": "true"
42
+ }
43
+ }
44
+ }
45
+ })
46
+ llm = ChatOpenAI(model="gpt-4o")
47
+ agent = MCPAgent(llm=llm, client=client, max_steps=5)
48
+
49
+ # Wrap the agent execution with tracing
50
+ with hud.trace("my_mcp_trace", attributes={"query": "Find information about X"}):
51
+ result = await agent.run(
52
+ "Find information about X",
53
+ max_steps=5,
54
+ )
55
+
56
+ # Trace is automatically uploaded and available at https://app.hud.so/jobs/traces/[id]
57
+ ```
58
+
59
+ **Parameters:**
60
+
61
+ - `name` (str, optional): A name for this trace, useful for identification
62
+ - `attributes` (dict, optional): Additional metadata to associate with the trace
63
+
64
+ ### 2. Decorator: `@hud.register_trace`
65
+
66
+ Use the `@hud.register_trace` decorator to automatically trace an entire function:
67
+
68
+ ```python
69
+ import hud
70
+ from mcp_use import MCPAgent, MCPClient
71
+ from langchain_openai import ChatOpenAI
72
+
73
+ @hud.register_trace(name="mcp_search_function", attributes={"type": "search"})
74
+ async def perform_search(query: str):
75
+ client = MCPClient.from_dict({
76
+ "mcpServers": {
77
+ "search_server": {
78
+ "command": "npx",
79
+ "args": ["-y", "@search/mcp-server"],
80
+ }
81
+ }
82
+ })
83
+ llm = ChatOpenAI(model="gpt-4o")
84
+ agent = MCPAgent(llm=llm, client=client, max_steps=5)
85
+
86
+ return await agent.run(query, max_steps=5)
87
+
88
+ # Call the function - tracing happens automatically
89
+ result = await perform_search("What is the capital of France?")
90
+ ```
91
+
92
+ **Parameters:**
93
+
94
+ - `name` (str, optional): A name for this trace, defaults to the function name
95
+ - `attributes` (dict, optional): Additional metadata to associate with the trace
96
+
97
+ ## Viewing Traces
98
+
99
+ After a trace is captured, it's automatically uploaded to the HUD platform. You'll see a log message with a URL where you can view the trace:
100
+
101
+ ```
102
+ [hud] View trace at https://app.hud.so/jobs/traces/[trace_id]
103
+ ```
104
+
105
+ The trace view shows:
106
+
107
+ - Timeline of all MCP calls
108
+ - Request and response payloads
109
+ - Timing information
110
+ - Error details (if any)
111
+
112
+ ## Best Practices
113
+
114
+ - **Use descriptive names**: Choose meaningful names for your traces to make them easier to identify
115
+ - **Add relevant attributes**: Include metadata that will help you filter and analyze traces later
116
+ - **Limit trace scope**: Trace specific sections of code rather than entire applications to keep traces focused
117
+ - **Rely on automatic upload**: Traces are automatically uploaded when the context manager exits or the decorated function completes, so make sure the traced block or function is allowed to finish
118
+
119
+ ## Limitations
120
+
121
+ - Tracing only captures MCP calls, not other types of API calls or internal function calls
122
+ - Large traces with many calls may take longer to upload and display
123
+ - Trace data is temporarily stored in memory before being uploaded
124
+
125
+ ## Related Concepts
126
+
127
+ - [Job](/concepts/job): Jobs can contain multiple traces
128
+ - [Environment](/concepts/environment): Environments can be associated with traces
129
+ - [Task](/concepts/task): Tasks can be traced to analyze performance
@@ -0,0 +1,160 @@
1
+ ---
2
+ title: 'Uploading TaskSets'
3
+ description: 'Learn how to upload TaskSets and view them on the HUD platform'
4
+ ---
5
+
6
+ # Uploading TaskSets
7
+
8
+ TaskSets are collections of tasks that can be uploaded to the HUD platform for evaluation and sharing. This guide explains how to upload TaskSets and access them through the platform.
9
+
10
+ ## Creating and Uploading a TaskSet
11
+
12
+ You can create a TaskSet from a list of tasks and upload it to the platform:
13
+
14
+ ```python
15
+ from hud.task import Task
16
+ from hud.taskset import TaskSet
17
+
18
+ # Create tasks
19
+ tasks = [
20
+ Task(
21
+ prompt="Navigate to example.com and verify the login page is displayed",
22
+ gym="hud-browser",
23
+ setup=[
24
+ ("goto", "https://example.com/login")
25
+ ],
26
+ evaluate={
27
+ "function": "page_contains",
28
+ "args": "Login"
29
+ }
30
+ ),
31
+ Task(
32
+ prompt="What is the capital of France?",
33
+ gym="hud-browser",
34
+ evaluate={
35
+ "function": "response_includes",
36
+ "args": "Paris"
37
+ }
38
+ )
39
+ ]
40
+
41
+ # Create a TaskSet
42
+ taskset = TaskSet(tasks=tasks)
43
+
44
+ # Upload the TaskSet
45
+ taskset_id = await taskset.upload("my-taskset")
46
+ print(f"TaskSet uploaded with ID: {taskset_id}")
47
+ ```
48
+
49
+ ## TaskSet Parameters
50
+
51
+ When creating a TaskSet, you can specify:
52
+
53
+ - `name`: A descriptive name for your TaskSet
54
+ - `description`: Detailed description of what the TaskSet evaluates
55
+ - `tasks`: List of Task objects
56
+ - `metadata`: Optional dictionary of metadata about the TaskSet
57
+
58
+ ## Task Configuration
59
+
60
+ Each Task in a TaskSet can include:
61
+
62
+ - `prompt`: The instruction or question for the agent
63
+ - `gym`: The environment type (e.g., "hud-browser", "hud-ubuntu")
64
+ - `setup`: Optional list of setup actions to run before the agent starts
65
+ - `evaluate`: Function configuration to determine task success
66
+ - `id`: Optional unique identifier for the task
67
+
68
+ Common evaluation functions include:
69
+ - `page_contains`: Checks if specific text exists on the page
70
+ - `response_includes`: Verifies if the agent's final response contains expected text
71
+ - `cookies_exist`: Checks if a set of cookies is present
72
+
73
+ ## Viewing TaskSets on the Platform
74
+
75
+ After uploading, you can view and manage your TaskSets at [app.hud.so/evalsets](https://app.hud.so/evalsets). The platform provides:
76
+
77
+ - List of all your uploaded TaskSets
78
+ - Detailed view of individual tasks within each TaskSet
79
+ - Task prompts and evaluation criteria
80
+ - Evaluation results when agents are run against the TaskSet
81
+
82
+ ## Loading an Existing TaskSet
83
+
84
+ You can load a previously uploaded TaskSet using its name:
85
+
86
+ ```python
87
+ from hud.taskset import TaskSet
88
+
89
+ # Load an existing TaskSet
90
+ taskset = await TaskSet.load("taskset-name")
91
+
92
+ # Access tasks
93
+ for task in taskset.tasks:
94
+ print(f"Task ID: {task.id}")
95
+ print(f"Prompt: {task.prompt}")
96
+ print(f"Evaluation: {task.evaluate}")
97
+ ```
98
+
99
+ ## Best Practices
100
+
101
+ 1. **Task Organization**
102
+ - Give tasks clear, unique IDs
103
+ - Use descriptive prompts
104
+ - Group related tasks into themed TaskSets
105
+
106
+ 2. **Evaluation Design**
107
+ - Choose appropriate evaluation functions
108
+ - Provide clear success criteria
109
+ - Test evaluation logic before uploading
110
+
111
+ 3. **Documentation**
112
+ - Write clear task prompts
113
+ - Document expected agent behavior
114
+ - Include example solutions where appropriate
115
+
116
+ ## Running Evaluations
117
+
118
+ Once uploaded, you can run agents against your TaskSet:
119
+
120
+ ```python
121
+ from hud.job import run_job
122
+ from my_agent import MyAgent
123
+
124
+ # Run evaluation using the TaskSet
125
+ job = await run_job(
126
+ agent_cls=MyAgent,
127
+ task_or_taskset=taskset,
128
+ job_name="Evaluation Run"
129
+ )
130
+
131
+ # View results on app.hud.so/jobs/{job.id}
132
+ print(f"View results at: https://app.hud.so/jobs/{job.id}")
133
+ ```
134
+
135
+ ## Platform Features
136
+
137
+ The HUD platform ([app.hud.so](https://app.hud.so)) provides several features for working with TaskSets:
138
+
139
+ 1. **TaskSet Management**
140
+ - Browse all uploaded TaskSets
141
+ - View individual task details
142
+ - Filter and search tasks
143
+ - Track evaluation history
144
+
145
+ 2. **Analysis Tools**
146
+ - Compare agent performance
147
+ - View detailed task results
148
+ - Export evaluation data
149
+ - Share results with team members
150
+
151
+ 3. **Collaboration**
152
+ - Share TaskSets with team members
153
+ - Collaborate on task creation
154
+ - Track changes and versions
155
+
156
+ ## Related Topics
157
+
158
+ - [Task Creation](/concepts/task)
159
+ - [Running Evaluations](/running-your-agent)
160
+ - [Job Management](/concepts/job)