hud-python 0.2.7__tar.gz → 0.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (169) hide show
  1. {hud_python-0.2.7 → hud_python-0.2.8}/PKG-INFO +1 -1
  2. {hud_python-0.2.7 → hud_python-0.2.8}/examples/sheetbench_direct_example.ipynb +20 -56
  3. hud_python-0.2.8/examples/sheets_bench_cua_example.ipynb +218 -0
  4. {hud_python-0.2.7 → hud_python-0.2.8}/hud/env/docker_client.py +2 -1
  5. {hud_python-0.2.7 → hud_python-0.2.8}/hud/utils/tests/test_version.py +1 -1
  6. {hud_python-0.2.7 → hud_python-0.2.8}/hud/version.py +1 -1
  7. {hud_python-0.2.7 → hud_python-0.2.8}/pyproject.toml +1 -1
  8. {hud_python-0.2.7 → hud_python-0.2.8}/.env.example +0 -0
  9. {hud_python-0.2.7 → hud_python-0.2.8}/.github/workflows/ci.yml +0 -0
  10. {hud_python-0.2.7 → hud_python-0.2.8}/.github/workflows/release.yml +0 -0
  11. {hud_python-0.2.7 → hud_python-0.2.8}/.gitignore +0 -0
  12. {hud_python-0.2.7 → hud_python-0.2.8}/LICENSE +0 -0
  13. {hud_python-0.2.7 → hud_python-0.2.8}/MANIFEST.in +0 -0
  14. {hud_python-0.2.7 → hud_python-0.2.8}/README.md +0 -0
  15. {hud_python-0.2.7 → hud_python-0.2.8}/docs/advanced/cla-details.mdx +0 -0
  16. {hud_python-0.2.7 → hud_python-0.2.8}/docs/advanced/environment-control.mdx +0 -0
  17. {hud_python-0.2.7 → hud_python-0.2.8}/docs/advanced/tracing.mdx +0 -0
  18. {hud_python-0.2.7 → hud_python-0.2.8}/docs/advanced/uploading.mdx +0 -0
  19. {hud_python-0.2.7 → hud_python-0.2.8}/docs/api-reference/adapters.mdx +0 -0
  20. {hud_python-0.2.7 → hud_python-0.2.8}/docs/api-reference/env.mdx +0 -0
  21. {hud_python-0.2.7 → hud_python-0.2.8}/docs/api-reference/gym.mdx +0 -0
  22. {hud_python-0.2.7 → hud_python-0.2.8}/docs/api-reference/job.mdx +0 -0
  23. {hud_python-0.2.7 → hud_python-0.2.8}/docs/api-reference/task.mdx +0 -0
  24. {hud_python-0.2.7 → hud_python-0.2.8}/docs/api-reference/taskset.mdx +0 -0
  25. {hud_python-0.2.7 → hud_python-0.2.8}/docs/api-reference/telemetry.mdx +0 -0
  26. {hud_python-0.2.7 → hud_python-0.2.8}/docs/api-reference/trajectory.mdx +0 -0
  27. {hud_python-0.2.7 → hud_python-0.2.8}/docs/concepts/adapter.mdx +0 -0
  28. {hud_python-0.2.7 → hud_python-0.2.8}/docs/concepts/agent.mdx +0 -0
  29. {hud_python-0.2.7 → hud_python-0.2.8}/docs/concepts/environment.mdx +0 -0
  30. {hud_python-0.2.7 → hud_python-0.2.8}/docs/concepts/job.mdx +0 -0
  31. {hud_python-0.2.7 → hud_python-0.2.8}/docs/concepts/task.mdx +0 -0
  32. {hud_python-0.2.7 → hud_python-0.2.8}/docs/concepts/trajectory.mdx +0 -0
  33. {hud_python-0.2.7 → hud_python-0.2.8}/docs/docs.json +0 -0
  34. {hud_python-0.2.7 → hud_python-0.2.8}/docs/environment-creation.mdx +0 -0
  35. {hud_python-0.2.7 → hud_python-0.2.8}/docs/environments/browser.mdx +0 -0
  36. {hud_python-0.2.7 → hud_python-0.2.8}/docs/environments/custom-environments.mdx +0 -0
  37. {hud_python-0.2.7 → hud_python-0.2.8}/docs/environments/custom.mdx +0 -0
  38. {hud_python-0.2.7 → hud_python-0.2.8}/docs/environments/osworld-ubuntu.mdx +0 -0
  39. {hud_python-0.2.7 → hud_python-0.2.8}/docs/environments/qa.mdx +0 -0
  40. {hud_python-0.2.7 → hud_python-0.2.8}/docs/examples/alignment-evaluation.mdx +0 -0
  41. {hud_python-0.2.7 → hud_python-0.2.8}/docs/examples/benchmarking-agents.mdx +0 -0
  42. {hud_python-0.2.7 → hud_python-0.2.8}/docs/examples/custom-os-env.mdx +0 -0
  43. {hud_python-0.2.7 → hud_python-0.2.8}/docs/examples/mcp-agent-tracing.mdx +0 -0
  44. {hud_python-0.2.7 → hud_python-0.2.8}/docs/examples/web-app-testing.mdx +0 -0
  45. {hud_python-0.2.7 → hud_python-0.2.8}/docs/examples/web-mocks.mdx +0 -0
  46. {hud_python-0.2.7 → hud_python-0.2.8}/docs/favicon.png +0 -0
  47. {hud_python-0.2.7 → hud_python-0.2.8}/docs/logo/hud_logo.svg +0 -0
  48. {hud_python-0.2.7 → hud_python-0.2.8}/docs/logo/hud_logo_dark.svg +0 -0
  49. {hud_python-0.2.7 → hud_python-0.2.8}/docs/quickstart.mdx +0 -0
  50. {hud_python-0.2.7 → hud_python-0.2.8}/docs/running-your-agent.mdx +0 -0
  51. {hud_python-0.2.7 → hud_python-0.2.8}/docs/task-creation.mdx +0 -0
  52. {hud_python-0.2.7 → hud_python-0.2.8}/environments/novnc_ubuntu/Dockerfile +0 -0
  53. {hud_python-0.2.7 → hud_python-0.2.8}/environments/novnc_ubuntu/pyproject.toml +0 -0
  54. {hud_python-0.2.7 → hud_python-0.2.8}/environments/novnc_ubuntu/src/hud_controller/__init__.py +0 -0
  55. {hud_python-0.2.7 → hud_python-0.2.8}/environments/novnc_ubuntu/src/hud_controller/pyautogui_rosetta.py +0 -0
  56. {hud_python-0.2.7 → hud_python-0.2.8}/environments/novnc_ubuntu/src/hud_controller/step.py +0 -0
  57. {hud_python-0.2.7 → hud_python-0.2.8}/environments/pokemon_controller/Dockerfile +0 -0
  58. {hud_python-0.2.7 → hud_python-0.2.8}/environments/pokemon_controller/pyproject.toml +0 -0
  59. {hud_python-0.2.7 → hud_python-0.2.8}/environments/pokemon_controller/src/hud_controller/__init__.py +0 -0
  60. {hud_python-0.2.7 → hud_python-0.2.8}/environments/pokemon_controller/src/hud_controller/display_adapters.py +0 -0
  61. {hud_python-0.2.7 → hud_python-0.2.8}/environments/pokemon_controller/src/hud_controller/emulator.py +0 -0
  62. {hud_python-0.2.7 → hud_python-0.2.8}/environments/pokemon_controller/src/hud_controller/evaluator.py +0 -0
  63. {hud_python-0.2.7 → hud_python-0.2.8}/environments/pokemon_controller/src/hud_controller/kill.py +0 -0
  64. {hud_python-0.2.7 → hud_python-0.2.8}/environments/pokemon_controller/src/hud_controller/main.py +0 -0
  65. {hud_python-0.2.7 → hud_python-0.2.8}/environments/pokemon_controller/src/hud_controller/setup.py +0 -0
  66. {hud_python-0.2.7 → hud_python-0.2.8}/environments/pokemon_controller/src/hud_controller/step.py +0 -0
  67. {hud_python-0.2.7 → hud_python-0.2.8}/environments/qa_controller/Dockerfile +0 -0
  68. {hud_python-0.2.7 → hud_python-0.2.8}/environments/qa_controller/pyproject.toml +0 -0
  69. {hud_python-0.2.7 → hud_python-0.2.8}/environments/qa_controller/src/hud_controller/__init__.py +0 -0
  70. {hud_python-0.2.7 → hud_python-0.2.8}/environments/qa_controller/src/hud_controller/evaluate/__init__.py +0 -0
  71. {hud_python-0.2.7 → hud_python-0.2.8}/environments/qa_controller/src/hud_controller/evaluate/matchers.py +0 -0
  72. {hud_python-0.2.7 → hud_python-0.2.8}/environments/qa_controller/src/hud_controller/info.py +0 -0
  73. {hud_python-0.2.7 → hud_python-0.2.8}/environments/qa_controller/src/hud_controller/setup/__init__.py +0 -0
  74. {hud_python-0.2.7 → hud_python-0.2.8}/environments/qa_controller/src/hud_controller/setup/question.py +0 -0
  75. {hud_python-0.2.7 → hud_python-0.2.8}/environments/qa_controller/src/hud_controller/step.py +0 -0
  76. {hud_python-0.2.7 → hud_python-0.2.8}/environments/qa_controller/src/hud_controller/utils/__init__.py +0 -0
  77. {hud_python-0.2.7 → hud_python-0.2.8}/environments/qa_controller/src/hud_controller/utils/state.py +0 -0
  78. {hud_python-0.2.7 → hud_python-0.2.8}/examples/README.md +0 -0
  79. {hud_python-0.2.7 → hud_python-0.2.8}/examples/appflowy.ipynb +0 -0
  80. {hud_python-0.2.7 → hud_python-0.2.8}/examples/browser_use.ipynb +0 -0
  81. {hud_python-0.2.7 → hud_python-0.2.8}/examples/custom_task_example.ipynb +0 -0
  82. {hud_python-0.2.7 → hud_python-0.2.8}/examples/jobs.ipynb +0 -0
  83. {hud_python-0.2.7 → hud_python-0.2.8}/examples/local.ipynb +0 -0
  84. {hud_python-0.2.7 → hud_python-0.2.8}/examples/mcp_test.ipynb +0 -0
  85. {hud_python-0.2.7 → hud_python-0.2.8}/examples/osworld.ipynb +0 -0
  86. {hud_python-0.2.7 → hud_python-0.2.8}/examples/pokemon_local.ipynb +0 -0
  87. {hud_python-0.2.7 → hud_python-0.2.8}/examples/pokemon_remote.ipynb +0 -0
  88. {hud_python-0.2.7 → hud_python-0.2.8}/examples/remote.ipynb +0 -0
  89. {hud_python-0.2.7 → hud_python-0.2.8}/examples/sensitive_data.ipynb +0 -0
  90. {hud_python-0.2.7 → hud_python-0.2.8}/examples/tasks.ipynb +0 -0
  91. {hud_python-0.2.7 → hud_python-0.2.8}/examples/wordle_example.ipynb +0 -0
  92. {hud_python-0.2.7 → hud_python-0.2.8}/hud/__init__.py +0 -0
  93. {hud_python-0.2.7 → hud_python-0.2.8}/hud/adapters/__init__.py +0 -0
  94. {hud_python-0.2.7 → hud_python-0.2.8}/hud/adapters/claude/__init__.py +0 -0
  95. {hud_python-0.2.7 → hud_python-0.2.8}/hud/adapters/claude/adapter.py +0 -0
  96. {hud_python-0.2.7 → hud_python-0.2.8}/hud/adapters/claude/tests/__init__.py +0 -0
  97. {hud_python-0.2.7 → hud_python-0.2.8}/hud/adapters/claude/tests/test_adapter.py +0 -0
  98. {hud_python-0.2.7 → hud_python-0.2.8}/hud/adapters/common/__init__.py +0 -0
  99. {hud_python-0.2.7 → hud_python-0.2.8}/hud/adapters/common/adapter.py +0 -0
  100. {hud_python-0.2.7 → hud_python-0.2.8}/hud/adapters/common/tests/__init__.py +0 -0
  101. {hud_python-0.2.7 → hud_python-0.2.8}/hud/adapters/common/tests/test_adapter.py +0 -0
  102. {hud_python-0.2.7 → hud_python-0.2.8}/hud/adapters/common/types.py +0 -0
  103. {hud_python-0.2.7 → hud_python-0.2.8}/hud/adapters/operator/__init__.py +0 -0
  104. {hud_python-0.2.7 → hud_python-0.2.8}/hud/adapters/operator/adapter.py +0 -0
  105. {hud_python-0.2.7 → hud_python-0.2.8}/hud/adapters/operator/tests/__init__.py +0 -0
  106. {hud_python-0.2.7 → hud_python-0.2.8}/hud/adapters/operator/tests/test_adapter.py +0 -0
  107. {hud_python-0.2.7 → hud_python-0.2.8}/hud/agent/__init__.py +0 -0
  108. {hud_python-0.2.7 → hud_python-0.2.8}/hud/agent/base.py +0 -0
  109. {hud_python-0.2.7 → hud_python-0.2.8}/hud/agent/claude.py +0 -0
  110. {hud_python-0.2.7 → hud_python-0.2.8}/hud/agent/claude_plays_pokemon.py +0 -0
  111. {hud_python-0.2.7 → hud_python-0.2.8}/hud/agent/langchain.py +0 -0
  112. {hud_python-0.2.7 → hud_python-0.2.8}/hud/agent/misc/__init__.py +0 -0
  113. {hud_python-0.2.7 → hud_python-0.2.8}/hud/agent/misc/response_agent.py +0 -0
  114. {hud_python-0.2.7 → hud_python-0.2.8}/hud/agent/operator.py +0 -0
  115. {hud_python-0.2.7 → hud_python-0.2.8}/hud/agent/tests/__init__.py +0 -0
  116. {hud_python-0.2.7 → hud_python-0.2.8}/hud/agent/tests/test_base.py +0 -0
  117. {hud_python-0.2.7 → hud_python-0.2.8}/hud/env/__init__.py +0 -0
  118. {hud_python-0.2.7 → hud_python-0.2.8}/hud/env/client.py +0 -0
  119. {hud_python-0.2.7 → hud_python-0.2.8}/hud/env/environment.py +0 -0
  120. {hud_python-0.2.7 → hud_python-0.2.8}/hud/env/local_docker_client.py +0 -0
  121. {hud_python-0.2.7 → hud_python-0.2.8}/hud/env/remote_client.py +0 -0
  122. {hud_python-0.2.7 → hud_python-0.2.8}/hud/env/remote_docker_client.py +0 -0
  123. {hud_python-0.2.7 → hud_python-0.2.8}/hud/evaluators/__init__.py +0 -0
  124. {hud_python-0.2.7 → hud_python-0.2.8}/hud/evaluators/base.py +0 -0
  125. {hud_python-0.2.7 → hud_python-0.2.8}/hud/evaluators/inspect.py +0 -0
  126. {hud_python-0.2.7 → hud_python-0.2.8}/hud/evaluators/judge.py +0 -0
  127. {hud_python-0.2.7 → hud_python-0.2.8}/hud/evaluators/match.py +0 -0
  128. {hud_python-0.2.7 → hud_python-0.2.8}/hud/evaluators/remote.py +0 -0
  129. {hud_python-0.2.7 → hud_python-0.2.8}/hud/evaluators/tests/__init__.py +0 -0
  130. {hud_python-0.2.7 → hud_python-0.2.8}/hud/evaluators/tests/test_inspect.py +0 -0
  131. {hud_python-0.2.7 → hud_python-0.2.8}/hud/evaluators/tests/test_judge.py +0 -0
  132. {hud_python-0.2.7 → hud_python-0.2.8}/hud/evaluators/tests/test_match.py +0 -0
  133. {hud_python-0.2.7 → hud_python-0.2.8}/hud/evaluators/tests/test_remote.py +0 -0
  134. {hud_python-0.2.7 → hud_python-0.2.8}/hud/exceptions.py +0 -0
  135. {hud_python-0.2.7 → hud_python-0.2.8}/hud/gym.py +0 -0
  136. {hud_python-0.2.7 → hud_python-0.2.8}/hud/job.py +0 -0
  137. {hud_python-0.2.7 → hud_python-0.2.8}/hud/py.typed +0 -0
  138. {hud_python-0.2.7 → hud_python-0.2.8}/hud/server/__init__.py +0 -0
  139. {hud_python-0.2.7 → hud_python-0.2.8}/hud/server/requests.py +0 -0
  140. {hud_python-0.2.7 → hud_python-0.2.8}/hud/server/tests/__init__.py +0 -0
  141. {hud_python-0.2.7 → hud_python-0.2.8}/hud/server/tests/test_requests.py +0 -0
  142. {hud_python-0.2.7 → hud_python-0.2.8}/hud/settings.py +0 -0
  143. {hud_python-0.2.7 → hud_python-0.2.8}/hud/task.py +0 -0
  144. {hud_python-0.2.7 → hud_python-0.2.8}/hud/taskset.py +0 -0
  145. {hud_python-0.2.7 → hud_python-0.2.8}/hud/telemetry/__init__.py +0 -0
  146. {hud_python-0.2.7 → hud_python-0.2.8}/hud/telemetry/_trace.py +0 -0
  147. {hud_python-0.2.7 → hud_python-0.2.8}/hud/telemetry/context.py +0 -0
  148. {hud_python-0.2.7 → hud_python-0.2.8}/hud/telemetry/exporter.py +0 -0
  149. {hud_python-0.2.7 → hud_python-0.2.8}/hud/telemetry/instrumentation/__init__.py +0 -0
  150. {hud_python-0.2.7 → hud_python-0.2.8}/hud/telemetry/instrumentation/mcp.py +0 -0
  151. {hud_python-0.2.7 → hud_python-0.2.8}/hud/telemetry/instrumentation/registry.py +0 -0
  152. {hud_python-0.2.7 → hud_python-0.2.8}/hud/telemetry/mcp_models.py +0 -0
  153. {hud_python-0.2.7 → hud_python-0.2.8}/hud/telemetry/tests/__init__.py +0 -0
  154. {hud_python-0.2.7 → hud_python-0.2.8}/hud/telemetry/tests/test_context.py +0 -0
  155. {hud_python-0.2.7 → hud_python-0.2.8}/hud/telemetry/tests/test_trace.py +0 -0
  156. {hud_python-0.2.7 → hud_python-0.2.8}/hud/trajectory.py +0 -0
  157. {hud_python-0.2.7 → hud_python-0.2.8}/hud/types.py +0 -0
  158. {hud_python-0.2.7 → hud_python-0.2.8}/hud/utils/__init__.py +0 -0
  159. {hud_python-0.2.7 → hud_python-0.2.8}/hud/utils/agent.py +0 -0
  160. {hud_python-0.2.7 → hud_python-0.2.8}/hud/utils/common.py +0 -0
  161. {hud_python-0.2.7 → hud_python-0.2.8}/hud/utils/config.py +0 -0
  162. {hud_python-0.2.7 → hud_python-0.2.8}/hud/utils/misc.py +0 -0
  163. {hud_python-0.2.7 → hud_python-0.2.8}/hud/utils/progress.py +0 -0
  164. {hud_python-0.2.7 → hud_python-0.2.8}/hud/utils/telemetry.py +0 -0
  165. {hud_python-0.2.7 → hud_python-0.2.8}/hud/utils/tests/__init__.py +0 -0
  166. {hud_python-0.2.7 → hud_python-0.2.8}/hud/utils/tests/test_common.py +0 -0
  167. {hud_python-0.2.7 → hud_python-0.2.8}/hud/utils/tests/test_config.py +0 -0
  168. {hud_python-0.2.7 → hud_python-0.2.8}/hud/utils/tests/test_progress.py +0 -0
  169. {hud_python-0.2.7 → hud_python-0.2.8}/hud/utils/tests/test_telemetry.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.2.7
3
+ Version: 0.2.8
4
4
  Summary: SDK for the HUD evaluation platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
@@ -14,6 +14,7 @@
14
14
  "export HUD_API_KEY=your_api_key_here\n",
15
15
  "pip install hud-python\n",
16
16
  "pip install pandas # optional for debugging\n",
17
+ "pip install openpyxl # also a requirement for pandas\n",
17
18
  "```"
18
19
  ]
19
20
  },
@@ -26,7 +27,7 @@
26
27
  },
27
28
  {
28
29
  "cell_type": "code",
29
- "execution_count": 70,
30
+ "execution_count": 2,
30
31
  "metadata": {},
31
32
  "outputs": [],
32
33
  "source": [
@@ -51,24 +52,9 @@
51
52
  },
52
53
  {
53
54
  "cell_type": "code",
54
- "execution_count": 72,
55
+ "execution_count": null,
55
56
  "metadata": {},
56
- "outputs": [
57
- {
58
- "name": "stdout",
59
- "output_type": "stream",
60
- "text": [
61
- "Prompt: Sheet REV_QTR lists quarterly revenue from Q1-2022 through Q1-2025. Compute the compound annual growth rate between those two points. Enter the result in ANSWER!A1.\n",
62
- "['REV_QTR']\n",
63
- " Quarter Revenue\n",
64
- "0 Q1-2022 4200000\n",
65
- "1 Q2-2022 4404653\n",
66
- "2 Q3-2022 4746171\n",
67
- "3 Q4-2022 5062265\n",
68
- "4 Q1-2023 5365661\n"
69
- ]
70
- }
71
- ],
57
+ "outputs": [],
72
58
  "source": [
73
59
  "# Load a single task for testing\n",
74
60
  "test_task = taskset[1]\n",
@@ -93,20 +79,9 @@
93
79
  },
94
80
  {
95
81
  "cell_type": "code",
96
- "execution_count": 73,
82
+ "execution_count": null,
97
83
  "metadata": {},
98
- "outputs": [
99
- {
100
- "name": "stdout",
101
- "output_type": "stream",
102
- "text": [
103
- "['REV_QTR', 'ANSWER']\n",
104
- "Empty DataFrame\n",
105
- "Columns: [0.241955862]\n",
106
- "Index: []\n"
107
- ]
108
- }
109
- ],
84
+ "outputs": [],
110
85
  "source": [
111
86
  "###\n",
112
87
  "### Your agent loop goes here, using *prompt* and *input_xlsx_file*, returns *output_xlsx_file*\n",
@@ -129,15 +104,7 @@
129
104
  "cell_type": "code",
130
105
  "execution_count": null,
131
106
  "metadata": {},
132
- "outputs": [
133
- {
134
- "name": "stdout",
135
- "output_type": "stream",
136
- "text": [
137
- "Reward: 1.0\n"
138
- ]
139
- }
140
- ],
107
+ "outputs": [],
141
108
  "source": [
142
109
  "# Get the base64 encoded xlsx file\n",
143
110
  "base64_output_xlsx_file = base64.b64encode(output_xlsx_file).decode(\"utf-8\")\n",
@@ -174,7 +141,7 @@
174
141
  },
175
142
  {
176
143
  "cell_type": "code",
177
- "execution_count": 64,
144
+ "execution_count": 3,
178
145
  "metadata": {},
179
146
  "outputs": [],
180
147
  "source": [
@@ -185,11 +152,11 @@
185
152
  },
186
153
  {
187
154
  "cell_type": "code",
188
- "execution_count": 69,
155
+ "execution_count": 17,
189
156
  "metadata": {},
190
157
  "outputs": [],
191
158
  "source": [
192
- "async def run_single_task(task):\n",
159
+ "async def run_single_task(task, job=None):\n",
193
160
  " prompt = task.prompt\n",
194
161
  " input_xlsx_file = requests.get(task.setup.args[0]).content\n",
195
162
  "\n",
@@ -202,7 +169,7 @@
202
169
  " # Run evaluation\n",
203
170
  " task.setup = (\"sheets_from_bytes\", base64_output_xlsx_file)\n",
204
171
  " task.id = None\n",
205
- " env = await gym.make(task)\n",
172
+ " env = await gym.make(task, job=job)\n",
206
173
  " result = await env.evaluate()\n",
207
174
  " await env.close()\n",
208
175
  "\n",
@@ -211,25 +178,22 @@
211
178
  },
212
179
  {
213
180
  "cell_type": "code",
214
- "execution_count": 68,
181
+ "execution_count": null,
215
182
  "metadata": {},
216
- "outputs": [
217
- {
218
- "name": "stdout",
219
- "output_type": "stream",
220
- "text": [
221
- "Average reward: 0.0\n"
222
- ]
223
- }
224
- ],
183
+ "outputs": [],
225
184
  "source": [
226
185
  "# Loading and evaluating 50 tasks should take around 2 minutes, without the agent loop\n",
227
186
  "import asyncio\n",
228
187
  "\n",
188
+ "# Optional: register the job on the app.hud.so platform\n",
189
+ "from hud import create_job\n",
190
+ "\n",
191
+ "# Run the taskset\n",
229
192
  "taskset = await load_taskset(\"SheetBench-50\", metadata={\"partial\": True})\n",
193
+ "job = await create_job(\"SheetBench-50-Excel-Agent\", evalset_id=taskset.id)\n",
230
194
  "\n",
231
- "task_jobs = [run_single_task(task) for task in taskset]\n",
232
- "rewards = await asyncio.gather(*task_jobs)\n",
195
+ "task_runs = [run_single_task(task, job) for task in taskset]\n",
196
+ "rewards = await asyncio.gather(*task_runs)\n",
233
197
  "\n",
234
198
  "print(f\"Average reward: {sum(rewards) / len(rewards)}\")"
235
199
  ]
@@ -0,0 +1,218 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "%load_ext autoreload\n",
10
+ "%autoreload 2\n",
11
+ "\n",
12
+ "# uv pip install -e \".[dev]\"\n",
13
+ "from hud import gym, load_taskset\n",
14
+ "from hud.agent import ClaudeAgent, OperatorAgent, LangchainAgent # Pick your agent"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 2,
20
+ "metadata": {},
21
+ "outputs": [
22
+ {
23
+ "name": "stdout",
24
+ "output_type": "stream",
25
+ "text": [
26
+ "Task prompt: Given an automobile was purchased in Feb 1, 2022 with an estimated life of 7 years at a cost of 45,000 and was disposed of in April 1, 2025 with no salvage value and monthly depreciation is calculted to the nearest cent, calculate the loss on disposal. Put your answer in the ANSWER tab in cell A1.\n"
27
+ ]
28
+ }
29
+ ],
30
+ "source": [
31
+ "# Load the SheetBench-50 taskset and enable partial grading\n",
32
+ "taskset = await load_taskset(\"SheetBench-50\", metadata={\"partial\": True})\n",
33
+ "\n",
34
+ "task = taskset.tasks[6]\n",
35
+ "print(f\"Task prompt: {task.prompt}\")"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 3,
41
+ "metadata": {},
42
+ "outputs": [
43
+ {
44
+ "name": "stderr",
45
+ "output_type": "stream",
46
+ "text": [
47
+ "[INFO] 2025-07-07 20:48:19,780 | hud.env.remote_env_client | Gym created, see how to use it at https://docs.hud.so/environments/browser\n",
48
+ "[INFO] 2025-07-07 20:48:55,219 | hud.environment | View the live trace at https://app.hud.so/trace/38091b1b-ab28-4801-abbe-22e14594606e\n"
49
+ ]
50
+ },
51
+ {
52
+ "data": {
53
+ "text/html": [
54
+ "\n",
55
+ " <div style=\"width: 960px; height: 540px; overflow: hidden;\">\n",
56
+ " <div style=\"transform: scale(0.5); transform-origin: top left;\">\n",
57
+ " <iframe src=\"https://live.anchorbrowser.io?sessionId=8c7d218e-5698-4316-be80-104e199dd238\" width=\"1920\" height=\"1080\" style=\"border: 1px solid #ddd;\">\n",
58
+ " </iframe>\n",
59
+ " </div>\n",
60
+ " </div>\n",
61
+ " "
62
+ ],
63
+ "text/plain": [
64
+ "<IPython.core.display.HTML object>"
65
+ ]
66
+ },
67
+ "metadata": {},
68
+ "output_type": "display_data"
69
+ },
70
+ {
71
+ "data": {
72
+ "text/plain": [
73
+ "'\\n <div style=\"width: 960px; height: 540px; overflow: hidden;\">\\n <div style=\"transform: scale(0.5); transform-origin: top left;\">\\n <iframe src=\"https://live.anchorbrowser.io?sessionId=8c7d218e-5698-4316-be80-104e199dd238\" width=\"1920\" height=\"1080\" style=\"border: 1px solid #ddd;\">\\n </iframe>\\n </div>\\n </div>\\n '"
74
+ ]
75
+ },
76
+ "execution_count": 3,
77
+ "metadata": {},
78
+ "output_type": "execute_result"
79
+ }
80
+ ],
81
+ "source": [
82
+ "# Provision the browser environment\n",
83
+ "env = await gym.make(task)\n",
84
+ "\n",
85
+ "# Stream the environment's VNC\n",
86
+ "await env.stream()"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": 4,
92
+ "metadata": {},
93
+ "outputs": [
94
+ {
95
+ "name": "stderr",
96
+ "output_type": "stream",
97
+ "text": [
98
+ "[INFO] 2025-07-07 20:48:55,459 | hud.environment | Running agent in environment...\n",
99
+ "[INFO] 2025-07-07 20:48:59,439 | hud.agent.base | Predicting action...\n",
100
+ "[INFO] 2025-07-07 20:49:04,728 | hud.agent.base | Raw action: [{'action': 'screenshot', 'reasoning': \"I'll solve this problem and put the answer in cell A1 of the ANSWER tab. First, I need to calculate the loss on disposal of the automobile.\\n\\nTo calculate the loss on disposal, I need to:\\n1. Calculate the monthly depreciation rate\\n2. Determine how many months the car was used\\n3. Calculate the accumulated depreciation\\n4. Determine the book value at disposal\\n5. Calculate the loss (book value at disposal - salvage value, which is $0)\\n\\nLet me take a screenshot first to see the current state of the spreadsheet.\", 'logs': {'id': 'msg_01LHnz279HWA6RKvw4tgistM', 'content': [{'citations': None, 'text': \"I'll solve this problem and put the answer in cell A1 of the ANSWER tab. First, I need to calculate the loss on disposal of the automobile.\\n\\nTo calculate the loss on disposal, I need to:\\n1. Calculate the monthly depreciation rate\\n2. Determine how many months the car was used\\n3. Calculate the accumulated depreciation\\n4. Determine the book value at disposal\\n5. Calculate the loss (book value at disposal - salvage value, which is $0)\\n\\nLet me take a screenshot first to see the current state of the spreadsheet.\", 'type': 'text'}, {'id': 'toolu_019K4Wwz5kVyzTkBXQyvVNRD', 'input': {'action': 'screenshot', 'reasoning': \"I'll solve this problem and put the answer in cell A1 of the ANSWER tab. First, I need to calculate the loss on disposal of the automobile.\\n\\nTo calculate the loss on disposal, I need to:\\n1. Calculate the monthly depreciation rate\\n2. Determine how many months the car was used\\n3. Calculate the accumulated depreciation\\n4. Determine the book value at disposal\\n5. 
Calculate the loss (book value at disposal - salvage value, which is $0)\\n\\nLet me take a screenshot first to see the current state of the spreadsheet.\"}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 2749, 'cache_read_input_tokens': 0, 'input_tokens': 4, 'output_tokens': 166, 'service_tier': 'standard'}}}]\n",
101
+ "[INFO] 2025-07-07 20:49:04,738 | hud.environment | Step 0: Action: 📸 Screenshot │ Reasoning: I'll solve this problem and put the answer in cell A1 of the ANSWER tab. First, I need to calculate the loss on disposal of the automobile.\n",
102
+ "\n",
103
+ "To calculate the loss on disposal, I need to:\n",
104
+ "1. Calculate the monthly depreciation rate\n",
105
+ "2. Determine how many months the car was used\n",
106
+ "3. Calculate the accumulated depreciation\n",
107
+ "4. Determine the book value at disposal\n",
108
+ "5. Calculate the loss (book value at disposal - salvage value, which is $0)\n",
109
+ "\n",
110
+ "Let me take a screenshot first to see the current state of the spreadsheet. │ Logs: {'id': 'msg_01LHnz279HWA6RKvw4tgistM', 'content': [{'citations': None, 'text': \"I'll solve this problem and put the answer in cell A1 of the ANSWER tab. First, I need to calculate the loss on disposal of the automobile.\\n\\nTo calculate the loss on disposal, I need to:\\n1. Calc...\n",
111
+ "[INFO] 2025-07-07 20:49:10,351 | hud.environment | Step 0: Observation: Observation(screenshot=iVBORw0KGgoAAAANSUhEUgAABaAAAAOECAIAAADYLwGFAAAAAXNSR0IArs4c6QAAIABJREFUeJzs3XtcU/X/B/D32QabiAMVFBUV..., text=None)\n",
112
+ "[INFO] 2025-07-07 20:49:10,352 | hud.agent.base | Predicting action...\n",
113
+ "[INFO] 2025-07-07 20:49:16,702 | hud.agent.base | Raw action: [{'action': 'left_click', 'coordinate': [127, 752], 'reasoning': 'I need to switch to the ANSWER tab to input my calculation. Let me click on the tabs at the bottom of the spreadsheet to find the ANSWER tab.', 'logs': {'id': 'msg_01BB4JUsxZpyEKX9zHjNxhCn', 'content': [{'citations': None, 'text': 'I need to switch to the ANSWER tab to input my calculation. Let me click on the tabs at the bottom of the spreadsheet to find the ANSWER tab.', 'type': 'text'}, {'id': 'toolu_01LD9oYyPw954DiCJuS5mzA7', 'input': {'action': 'left_click', 'coordinate': [127, 752], 'reasoning': 'I need to switch to the ANSWER tab to input my calculation. Let me click on the tabs at the bottom of the spreadsheet to find the ANSWER tab.'}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 1852, 'cache_read_input_tokens': 2749, 'input_tokens': 7, 'output_tokens': 102, 'service_tier': 'standard'}}}]\n",
114
+ "[INFO] 2025-07-07 20:49:16,715 | hud.environment | Step 1: Action: 💥 Click at (178, 881) │ Reasoning: I need to switch to the ANSWER tab to input my calculation. Let me click on the tabs at the bottom of the spreadsheet to find the ANSWER tab. │ Logs: {'id': 'msg_01BB4JUsxZpyEKX9zHjNxhCn', 'content': [{'citations': None, 'text': 'I need to switch to the ANSWER tab to input my calculation. Let me click on the tabs at the bottom of the spreadsheet to find the ANSWER tab.', 'type': 'text'}, {'id': 'toolu_01LD9oYyPw954DiCJuS5mz...\n",
115
+ "[INFO] 2025-07-07 20:49:21,616 | hud.environment | Step 1: Observation: Observation(screenshot=iVBORw0KGgoAAAANSUhEUgAABaAAAAOECAIAAADYLwGFAAAAAXNSR0IArs4c6QAAIABJREFUeJzs3XtcU/X/B/D32QabiAMVFBUV..., text=None)\n",
116
+ "[INFO] 2025-07-07 20:49:21,617 | hud.agent.base | Predicting action...\n",
117
+ "[INFO] 2025-07-07 20:49:30,523 | hud.agent.base | Raw action: [{'action': 'left_click', 'coordinate': [43, 752], 'reasoning': 'I see that I\\'m in the INPUT tab. Let me look for the ANSWER tab by clicking on the \"+\" icon to add a new sheet, and then I\\'ll rename it to ANSWER.', 'logs': {'id': 'msg_01XaV6sdsddiDD6tpwmdXGz6', 'content': [{'citations': None, 'text': 'I see that I\\'m in the INPUT tab. Let me look for the ANSWER tab by clicking on the \"+\" icon to add a new sheet, and then I\\'ll rename it to ANSWER.', 'type': 'text'}, {'id': 'toolu_018kDm3CZDRuVrUGGYZEB3PC', 'input': {'action': 'left_click', 'coordinate': [43, 752], 'reasoning': 'I see that I\\'m in the INPUT tab. Let me look for the ANSWER tab by clicking on the \"+\" icon to add a new sheet, and then I\\'ll rename it to ANSWER.', 'logs': {'id': 'msg_01QbshsjqJbNMHHHbuxP1vLx', 'content': [{'citations': None, 'text': 'I see that I\\'m in the INPUT tab. Let me look for the ANSWER tab by clicking on the \"+\" icon to add a new sheet, and then I\\'ll rename it to ANSWER.', 'type': 'text'}, {'id': 'toolu_01LYB7IaMRBQHcgdLVcgXCSu', 'input': {'action': 'left_click', 'coordinate': [43, 752], 'reasoning': 'I\\'ll click on the \"+\" icon at the bottom left to add a new sheet, which I\\'ll rename as ANSWER.'}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 2053, 'cache_read_input_tokens': 4601, 'input_tokens': 7, 'output_tokens': 134, 'service_tier': 'standard'}}}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 1520, 'cache_read_input_tokens': 4601, 'input_tokens': 7, 'output_tokens': 450, 'service_tier': 'standard'}}}]\n",
118
+ "[INFO] 2025-07-07 20:49:30,535 | hud.environment | Step 2: Action: 💥 Click at (60, 881) │ Reasoning: I see that I'm in the INPUT tab. Let me look for the ANSWER tab by clicking on the \"+\" icon to add a new sheet, and then I'll rename it to ANSWER. │ Logs: {'id': 'msg_01XaV6sdsddiDD6tpwmdXGz6', 'content': [{'citations': None, 'text': 'I see that I\\'m in the INPUT tab. Let me look for the ANSWER tab by clicking on the \"+\" icon to add a new sheet, and then I\\'ll rename it to ANSWER.', 'type': 'text'}, {'id': 'toolu_018kDm3CZDRuVrU...\n",
119
+ "[INFO] 2025-07-07 20:49:36,136 | hud.environment | Step 2: Observation: Observation(screenshot=iVBORw0KGgoAAAANSUhEUgAABaAAAAOECAIAAADYLwGFAAAAAXNSR0IArs4c6QAAIABJREFUeJzs3XlcE9f2APAzmWwgBBBQVFww..., text=None)\n",
120
+ "[INFO] 2025-07-07 20:49:36,137 | hud.agent.base | Predicting action...\n",
121
+ "[INFO] 2025-07-07 20:49:44,240 | hud.agent.base | Raw action: [{'action': 'double_click', 'coordinate': [182, 752], 'reasoning': 'I see that a new sheet \"Sheet1\" has been created. I\\'ll double-click on \"Sheet1\" to rename it to \"ANSWER\".', 'logs': {'id': 'msg_01ERCNcchyNhwLAvtTzrStzB', 'content': [{'citations': None, 'text': 'I see that a new sheet \"Sheet1\" has been created. I\\'ll double-click on \"Sheet1\" to rename it to \"ANSWER\".', 'type': 'text'}, {'id': 'toolu_01Dj6VExd12CYLkebh6wwSGg', 'input': {'action': 'double_click', 'coordinate': [182, 752], 'reasoning': 'I see that a new sheet \"Sheet1\" has been created. I\\'ll double-click on \"Sheet1\" to rename it to \"ANSWER\".', 'logs': {'id': 'msg_01MLf4xyMX6wfP25eZGLsBFm', 'content': [{'citations': None, 'text': 'I see that a new sheet \"Sheet1\" has been created. I\\'ll double-click on \"Sheet1\" to rename it to \"ANSWER\".', 'type': 'text'}, {'id': 'toolu_01KXIYrnnZeOi5Jl1jNyGdlB', 'input': {'action': 'double_click', 'coordinate': [182, 752], 'reasoning': 'I\\'ll double-click on \"Sheet1\" to rename it to \"ANSWER\".'}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 1534, 'cache_read_input_tokens': 6121, 'input_tokens': 7, 'output_tokens': 106, 'service_tier': 'standard'}}}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 1835, 'cache_read_input_tokens': 6121, 'input_tokens': 7, 'output_tokens': 419, 'service_tier': 'standard'}}}]\n",
122
+ "[INFO] 2025-07-07 20:49:44,253 | hud.environment | Step 3: Action: 💥 Click at (255, 881) │ Reasoning: I see that a new sheet \"Sheet1\" has been created. I'll double-click on \"Sheet1\" to rename it to \"ANSWER\". │ Logs: {'id': 'msg_01ERCNcchyNhwLAvtTzrStzB', 'content': [{'citations': None, 'text': 'I see that a new sheet \"Sheet1\" has been created. I\\'ll double-click on \"Sheet1\" to rename it to \"ANSWER\".', 'type': 'text'}, {'id': 'toolu_01Dj6VExd12CYLkebh6wwSGg', 'input': {'action': 'double_cl...\n",
123
+ "[INFO] 2025-07-07 20:49:49,083 | hud.environment | Step 3: Observation: Observation(screenshot=iVBORw0KGgoAAAANSUhEUgAABaAAAAOECAIAAADYLwGFAAAAAXNSR0IArs4c6QAAIABJREFUeJzs3XtcU/X/B/D32RnbVBigoKh4..., text=None)\n",
124
+ "[INFO] 2025-07-07 20:49:49,084 | hud.agent.base | Predicting action...\n",
125
+ "[INFO] 2025-07-07 20:50:00,276 | hud.agent.base | Raw action: [{'action': 'type', 'text': 'ANSWER', 'reasoning': 'I see that the tab is now editable. I\\'ll type \"ANSWER\" to rename the sheet.', 'logs': {'id': 'msg_01JXXpgeRLigZx6q4GyS3iHv', 'content': [{'citations': None, 'text': 'I see that the tab is now editable. I\\'ll type \"ANSWER\" to rename the sheet.', 'type': 'text'}, {'id': 'toolu_017fKaZ1E4jQwvadzgYfLMNN', 'input': {'action': 'type', 'text': 'ANSWER', 'reasoning': 'I see that the tab is now editable. I\\'ll type \"ANSWER\" to rename the sheet.', 'logs': {'id': 'msg_01qUZytsFejMI5EL1hkqBYnY', 'content': [{'citations': None, 'text': 'I see that the tab is now editable. I\\'ll type \"ANSWER\" to rename the sheet.', 'type': 'text'}, {'id': 'toolu_01yPIJWmrPTqQiDZZD9w2jKr', 'input': {'action': 'type', 'text': 'ANSWER', 'reasoning': 'I\\'m typing \"ANSWER\" to rename the sheet.', 'logs': {'id': 'msg_01YvGY6ljNkVpHF8xO8q2jPq', 'content': [{'citations': None, 'text': 'I see that the tab is now editable. 
I\\'ll type \"ANSWER\" to rename the sheet.', 'type': 'text'}, {'id': 'toolu_01yySLJx0bRNYKw5b7bXJ9N2', 'input': {'action': 'type', 'text': 'ANSWER', 'reasoning': 'Now I\\'ll type \"ANSWER\" to rename the sheet.'}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 1356, 'cache_read_input_tokens': 6540, 'input_tokens': 7, 'output_tokens': 94, 'service_tier': 'standard'}}}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 1694, 'cache_read_input_tokens': 6540, 'input_tokens': 7, 'output_tokens': 394, 'service_tier': 'standard'}}}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 1786, 'cache_read_input_tokens': 7956, 'input_tokens': 7, 'output_tokens': 627, 'service_tier': 'standard'}}}]\n",
126
+ "[INFO] 2025-07-07 20:50:00,296 | hud.environment | Step 4: Action: ✍️ Type: \"ANSWER\" │ Reasoning: I see that the tab is now editable. I'll type \"ANSWER\" to rename the sheet. │ Logs: {'id': 'msg_01JXXpgeRLigZx6q4GyS3iHv', 'content': [{'citations': None, 'text': 'I see that the tab is now editable. I\\'ll type \"ANSWER\" to rename the sheet.', 'type': 'text'}, {'id': 'toolu_017fKaZ1E4jQwvadzgYfLMNN', 'input': {'action': 'type', 'text': 'ANSWER', 'reasoning': '...\n",
127
+ "[INFO] 2025-07-07 20:50:05,624 | hud.environment | Step 4: Observation: Observation(screenshot=iVBORw0KGgoAAAANSUhEUgAABaAAAAOECAIAAADYLwGFAAAAAXNSR0IArs4c6QAAIABJREFUeJzs3XtcU/X/B/D32RnbVBigoKh4..., text=None)\n",
128
+ "[INFO] 2025-07-07 20:50:05,625 | hud.agent.base | Predicting action...\n",
129
+ "[INFO] 2025-07-07 20:50:23,559 | hud.agent.base | Raw action: [{'action': 'left_click', 'coordinate': [68, 150], 'reasoning': 'Perfect! I\\'ve successfully renamed the tab to \"ANSWER\". Now I need to click on cell A1 and enter the calculated loss on disposal.\\n\\nLet me calculate the loss on disposal:\\n\\n1. Initial cost: $45,000\\n2. Estimated life: 7 years = 84 months\\n3. Monthly depreciation: $45,000 ÷ 84 = $535.71 per month\\n4. Period of use: Feb 1, 2022 to April 1, 2025 = 38 months\\n5. Accumulated depreciation: $535.71 × 38 = $20,357.14\\n6. Book value at disposal: $45,000 - $20,357.14 = $24,642.86\\n7. Loss on disposal: Book value - Salvage value = $24,642.86 - $0 = $24,642.86\\n\\nNow I\\'ll click on cell A1 and enter this value.', 'logs': {'id': 'msg_017tNV5t3ctJoJLCuSupeh7P', 'content': [{'citations': None, 'text': 'Perfect! I\\'ve successfully renamed the tab to \"ANSWER\". Now I need to click on cell A1 and enter the calculated loss on disposal.\\n\\nLet me calculate the loss on disposal:\\n\\n1. Initial cost: $45,000\\n2. Estimated life: 7 years = 84 months\\n3. Monthly depreciation: $45,000 ÷ 84 = $535.71 per month\\n4. Period of use: Feb 1, 2022 to April 1, 2025 = 38 months\\n5. Accumulated depreciation: $535.71 × 38 = $20,357.14\\n6. Book value at disposal: $45,000 - $20,357.14 = $24,642.86\\n7. Loss on disposal: Book value - Salvage value = $24,642.86 - $0 = $24,642.86\\n\\nNow I\\'ll click on cell A1 and enter this value.', 'type': 'text'}, {'id': 'toolu_01AmR4hMQUFgixsGTaJKCBbr', 'input': {'action': 'left_click', 'coordinate': [68, 150], 'reasoning': 'Perfect! I\\'ve successfully renamed the tab to \"ANSWER\". Now I need to click on cell A1 and enter the calculated loss on disposal.\\n\\nLet me calculate the loss on disposal:\\n\\n1. Initial cost: $45,000\\n2. Estimated life: 7 years = 84 months\\n3. Monthly depreciation: $45,000 ÷ 84 = $535.71 per month\\n4. Period of use: Feb 1, 2022 to April 1, 2025 = 38 months\\n5. 
Accumulated depreciation: $535.71 × 38 = $20,357.14\\n6. Book value at disposal: $45,000 - $20,357.14 = $24,642.86\\n7. Loss on disposal: Book value - Salvage value = $24,642.86 - $0 = $24,642.86\\n\\nNow I\\'ll click on cell A1 and enter this value.', 'logs': {'id': 'msg_01rCJk2PPp2kQHrbkIvTTbIm', 'content': [{'citations': None, 'text': 'Perfect! I\\'ve successfully renamed the tab to \"ANSWER\". Now I need to click on cell A1 and enter the calculated loss on disposal.\\n\\nLet me calculate the loss on disposal:\\n\\n1. Initial cost: $45,000\\n2. Estimated life: 7 years = 84 months\\n3. Monthly depreciation: $45,000 ÷ 84 = $535.71 per month\\n4. Period of use: Feb 1, 2022 to April 1, 2025 = 38 months\\n5. Accumulated depreciation: $535.71 × 38 = $20,357.14\\n6. Book value at disposal: $45,000 - $20,357.14 = $24,642.86\\n7. Loss on disposal: Book value - Salvage value = $24,642.86 - $0 = $24,642.86\\n\\nNow I\\'ll click on cell A1 and enter this value.', 'type': 'text'}, {'id': 'toolu_01Wdvnqpzl0LDrMJZ91E6O4l', 'input': {'action': 'left_click', 'coordinate': [68, 150], 'reasoning': \"I'll click on cell A1 to select it so I can enter the loss on disposal value.\", 'logs': {'id': 'msg_01dY7qw1XlJnMCVDQG3pzXHN', 'content': [{'citations': None, 'text': 'Perfect! I\\'ve successfully renamed the tab to \"ANSWER\". Now I need to click on cell A1 and enter the calculated loss on disposal.\\n\\nLet me calculate the loss on disposal:\\n\\n1. Initial cost: $45,000\\n2. Estimated life: 7 years = 84 months\\n3. Monthly depreciation: $45,000 ÷ 84 = $535.71 per month\\n4. Period of use: Feb 1, 2022 to April 1, 2025 = 38 months\\n5. Accumulated depreciation: $535.71 × 38 = $20,357.14\\n6. Book value at disposal: $45,000 - $20,357.14 = $24,642.86\\n7. 
Loss on disposal: Book value - Salvage value = $24,642.86 - $0 = $24,642.86\\n\\nNow I\\'ll click on cell A1 and enter this value.', 'type': 'text'}, {'id': 'toolu_01NNcQJRZ0Ai2rQtJTmJcfHM', 'input': {'action': 'left_click', 'coordinate': [68, 150], 'reasoning': \"I'll click on cell A1 to enter the calculated loss on disposal value.\"}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 2159, 'cache_read_input_tokens': 8583, 'input_tokens': 7, 'output_tokens': 350, 'service_tier': 'standard'}}}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 1925, 'cache_read_input_tokens': 8583, 'input_tokens': 7, 'output_tokens': 624, 'service_tier': 'standard'}}}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 1965, 'cache_read_input_tokens': 9742, 'input_tokens': 7, 'output_tokens': 1298, 'service_tier': 'standard'}}}]\n",
130
+ "[INFO] 2025-07-07 20:50:23,571 | hud.environment | Step 5: Action: 💥 Click at (95, 175) │ Reasoning: Perfect! I've successfully renamed the tab to \"ANSWER\". Now I need to click on cell A1 and enter the calculated loss on disposal.\n",
131
+ "\n",
132
+ "Let me calculate the loss on disposal:\n",
133
+ "\n",
134
+ "1. Initial cost: $45,000\n",
135
+ "2. Estimated life: 7 years = 84 months\n",
136
+ "3. Monthly depreciation: $45,000 ÷ 84 = $535.71 per month\n",
137
+ "4. Period of use: Feb 1, 2022 to April 1, 2025 = 38 months\n",
138
+ "5. Accumulated depreciation: $535.71 × 38 = $20,357.14\n",
139
+ "6. Book value at disposal: $45,000 - $20,357.14 = $24,642.86\n",
140
+ "7. Loss on disposal: Book value - Salvage value = $24,642.86 - $0 = $24,642.86\n",
141
+ "\n",
142
+ "Now I'll click on cell A1 and enter this value. │ Logs: {'id': 'msg_017tNV5t3ctJoJLCuSupeh7P', 'content': [{'citations': None, 'text': 'Perfect! I\\'ve successfully renamed the tab to \"ANSWER\". Now I need to click on cell A1 and enter the calculated loss on disposal.\\n\\nLet me calculate the loss on disposal:\\n\\n1. Initial cost: $45,...\n",
143
+ "[INFO] 2025-07-07 20:50:30,040 | hud.environment | Step 5: Observation: Observation(screenshot=iVBORw0KGgoAAAANSUhEUgAABaAAAAOECAIAAADYLwGFAAAAAXNSR0IArs4c6QAAIABJREFUeJzs3XlcE9f2APAzmWwgBBBQVFww..., text=None)\n",
144
+ "[INFO] 2025-07-07 20:50:30,041 | hud.agent.base | Predicting action...\n",
145
+ "[INFO] 2025-07-07 20:50:43,737 | hud.agent.base | Raw action: [{'action': 'type', 'text': '24642.86', 'reasoning': \"Perfect! Now I'll type the loss on disposal value that I calculated.\", 'logs': {'id': 'msg_01Hw2c689KsYbcf5cUMo7uXV', 'content': [{'citations': None, 'text': \"Perfect! Now I'll type the loss on disposal value that I calculated.\", 'type': 'text'}, {'id': 'toolu_01QwPSRuq1BMxZX2Qvk1gAZc', 'input': {'action': 'type', 'text': '24642.86', 'reasoning': \"Perfect! Now I'll type the loss on disposal value that I calculated.\", 'logs': {'id': 'msg_01e5jVNV0jFHKdRbm8k2O5m5', 'content': [{'citations': None, 'text': \"I'll now type the calculated loss on disposal value of $24,642.86 in cell A1.\", 'type': 'text'}, {'id': 'toolu_01p3oi8FgTWKaVSuM9TQiECH', 'input': {'action': 'type', 'text': '24642.86', 'reasoning': \"I'll now type the calculated loss on disposal value of $24,642.86 in cell A1.\", 'logs': {'id': 'msg_01Vkp22XLKyAIXVrLGJIoBtT', 'content': [{'citations': None, 'text': \"Perfect! Now I'll type the loss on disposal value that I calculated.\", 'type': 'text'}, {'id': 'toolu_019xEuuG7gHf0v3S4mZZVTIf', 'input': {'action': 'type', 'text': '24642.86', 'reasoning': \"I'll type the calculated loss on disposal value of $24,642.86 in cell A1.\", 'logs': {'id': 'msg_01A6XHpxgI7BXzM2u6gNhQ1h', 'content': [{'citations': None, 'text': \"Perfect! 
Now I'll type the loss on disposal value that I calculated.\", 'type': 'text'}, {'id': 'toolu_01h9J1gQx7XJeXkzX1rCQw6t', 'input': {'action': 'type', 'text': '24642.86', 'reasoning': \"Now I'll type the calculated loss on disposal value ($24,642.86) in cell A1.\"}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 1541, 'cache_read_input_tokens': 11040, 'input_tokens': 7, 'output_tokens': 95, 'service_tier': 'standard'}}}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 1890, 'cache_read_input_tokens': 11040, 'input_tokens': 7, 'output_tokens': 422, 'service_tier': 'standard'}}}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 1876, 'cache_read_input_tokens': 12357, 'input_tokens': 7, 'output_tokens': 680, 'service_tier': 'standard'}}}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 3283, 'cache_read_input_tokens': 11707, 'input_tokens': 7, 'output_tokens': 913, 'service_tier': 'standard'}}}]\n",
146
+ "[INFO] 2025-07-07 20:50:43,749 | hud.environment | Step 6: Action: ✍️ Type: \"24642.86\" │ Reasoning: Perfect! Now I'll type the loss on disposal value that I calculated. │ Logs: {'id': 'msg_01Hw2c689KsYbcf5cUMo7uXV', 'content': [{'citations': None, 'text': \"Perfect! Now I'll type the loss on disposal value that I calculated.\", 'type': 'text'}, {'id': 'toolu_01QwPSRuq1BMxZX2Qvk1gAZc', 'input': {'action': 'type', 'text': '24642.86', 'reasoning': \"Perfec...\n",
147
+ "[INFO] 2025-07-07 20:50:50,650 | hud.environment | Step 6: Observation: Observation(screenshot=iVBORw0KGgoAAAANSUhEUgAABaAAAAOECAIAAADYLwGFAAAAAXNSR0IArs4c6QAAIABJREFUeJzs3XlcE9f2APAzmWxsYVcUFBEV..., text=None)\n",
148
+ "[INFO] 2025-07-07 20:50:50,656 | hud.agent.base | Predicting action...\n",
149
+ "[INFO] 2025-07-07 20:51:08,678 | hud.agent.base | Raw action: [{'action': 'key', 'text': 'Return', 'reasoning': \"Now I'll press Enter to confirm the value in the cell.\", 'logs': {'id': 'msg_01KG88N2iR3R8xpQ4RvdLXAL', 'content': [{'citations': None, 'text': \"Now I'll press Enter to confirm the value in the cell.\", 'type': 'text'}, {'id': 'toolu_01S8yX6pS7z8QyHggMVZHbik', 'input': {'action': 'key', 'text': 'Return', 'reasoning': \"Now I'll press Enter to confirm the value in the cell.\", 'logs': {'id': 'msg_014UvKZoxe05NWQpnA4WGc5g', 'content': [{'citations': None, 'text': \"Now I'll press Enter to confirm the value in the cell.\", 'type': 'text'}, {'id': 'toolu_01fhkP7QwHJ4kJSVrGjE2cPg', 'input': {'action': 'key', 'text': 'Return', 'reasoning': \"Now I'll press Enter to confirm the value in the cell.\", 'logs': {'id': 'msg_01Af2GsNF3A7u5QTBRXq7oJi', 'content': [{'citations': None, 'text': \"Now I'll press Enter to confirm the value in the cell.\", 'type': 'text'}, {'id': 'toolu_01mTmtrSvDj1Ol1XMVw9IrTz', 'input': {'action': 'key', 'text': 'Return', 'reasoning': \"I'll press Enter to confirm the value in the cell.\", 'logs': {'id': 'msg_01uZoQJCXZRb1znZ66hcjKAT', 'content': [{'citations': None, 'text': \"Now I'll press Enter to confirm the value in the cell.\", 'type': 'text'}, {'id': 'toolu_01aEdDgYxlfuTnYQy5Vdztf0', 'input': {'action': 'key', 'text': 'Return', 'reasoning': \"Now I'll press Enter to confirm the value in the cell.\", 'logs': {'id': 'msg_01P4C96uLvTYzYVAaFLQkAKv', 'content': [{'citations': None, 'text': \"Now I'll press Enter to confirm the value in the cell.\", 'type': 'text'}, {'id': 'toolu_01k9UGPmUC8fVwXTjgvGCJeE', 'input': {'action': 'key', 'text': 'Return', 'reasoning': \"I'll press Enter to confirm the value and complete the entry.\"}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': 
{'cache_creation_input_tokens': 1504, 'cache_read_input_tokens': 13037, 'input_tokens': 7, 'output_tokens': 93, 'service_tier': 'standard'}}}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 1851, 'cache_read_input_tokens': 13037, 'input_tokens': 7, 'output_tokens': 423, 'service_tier': 'standard'}}}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 1852, 'cache_read_input_tokens': 14284, 'input_tokens': 7, 'output_tokens': 679, 'service_tier': 'standard'}}}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 3271, 'cache_read_input_tokens': 14284, 'input_tokens': 7, 'output_tokens': 938, 'service_tier': 'standard'}}}, 'name': 'computer', 'type': 'tool_use'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'tool_use', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 2225, 'cache_read_input_tokens': 14990, 'input_tokens': 7, 'output_tokens': 1084, 'service_tier': 'standard'}}}]\n",
150
+ "[INFO] 2025-07-07 20:51:08,691 | hud.environment | Step 7: Action: 🎹 Press keys: enter │ Reasoning: Now I'll press Enter to confirm the value in the cell. │ Logs: {'id': 'msg_01KG88N2iR3R8xpQ4RvdLXAL', 'content': [{'citations': None, 'text': \"Now I'll press Enter to confirm the value in the cell.\", 'type': 'text'}, {'id': 'toolu_01S8yX6pS7z8QyHggMVZHbik', 'input': {'action': 'key', 'text': 'Return', 'reasoning': \"Now I'll press Enter to...\n",
151
+ "[INFO] 2025-07-07 20:51:13,431 | hud.environment | Step 7: Observation: Observation(screenshot=iVBORw0KGgoAAAANSUhEUgAABaAAAAOECAIAAADYLwGFAAAAAXNSR0IArs4c6QAAIABJREFUeJzs3XlcE9f2APAzmWwgBBBQVFww..., text=None)\n",
152
+ "[INFO] 2025-07-07 20:51:13,432 | hud.agent.base | Predicting action...\n",
153
+ "[INFO] 2025-07-07 20:51:18,479 | hud.agent.base | Raw action: [{'action': 'response', 'text': \"Perfect! I have successfully completed the task. I've calculated the loss on disposal of the automobile and entered the value in cell A1 of the ANSWER tab.\\n\\nHere's a summary of my calculation:\\n\\n1. Initial cost: $45,000\\n2. Estimated life: 7 years = 84 months\\n3. Monthly depreciation: $45,000 ÷ 84 = $535.71 per month\\n4. Period of use: Feb 1, 2022 to April 1, 2025 = 38 months\\n5. Accumulated depreciation: $535.71 × 38 = $20,357.14\\n6. Book value at disposal: $45,000 - $20,357.14 = $24,642.86\\n7. Loss on disposal: Book value - Salvage value = $24,642.86 - $0 = $24,642.86\\n\\nThe answer of $24,642.86 is now displayed in cell A1 of the ANSWER tab, as requested.\", 'reasoning': \"Perfect! I have successfully completed the task. I've calculated the loss on disposal of the automobile and entered the value in cell A1 of the ANSWER tab.\\n\\nHere's a summary of my calculation:\\n\\n1. Initial cost: $45,000\\n2. Estimated life: 7 years = 84 months\\n3. Monthly depreciation: $45,000 ÷ 84 = $535.71 per month\\n4. Period of use: Feb 1, 2022 to April 1, 2025 = 38 months\\n5. Accumulated depreciation: $535.71 × 38 = $20,357.14\\n6. Book value at disposal: $45,000 - $20,357.14 = $24,642.86\\n7. Loss on disposal: Book value - Salvage value = $24,642.86 - $0 = $24,642.86\\n\\nThe answer of $24,642.86 is now displayed in cell A1 of the ANSWER tab, as requested.\", 'logs': {'id': 'msg_01N5inFmfGSDtHWXjiZ1zDWS', 'content': [{'citations': None, 'text': \"Perfect! I have successfully completed the task. I've calculated the loss on disposal of the automobile and entered the value in cell A1 of the ANSWER tab.\\n\\nHere's a summary of my calculation:\\n\\n1. Initial cost: $45,000\\n2. Estimated life: 7 years = 84 months\\n3. Monthly depreciation: $45,000 ÷ 84 = $535.71 per month\\n4. Period of use: Feb 1, 2022 to April 1, 2025 = 38 months\\n5. 
Accumulated depreciation: $535.71 × 38 = $20,357.14\\n6. Book value at disposal: $45,000 - $20,357.14 = $24,642.86\\n7. Loss on disposal: Book value - Salvage value = $24,642.86 - $0 = $24,642.86\\n\\nThe answer of $24,642.86 is now displayed in cell A1 of the ANSWER tab, as requested.\", 'type': 'text'}], 'model': 'claude-3-7-sonnet-20250219', 'role': 'assistant', 'stop_reason': 'end_turn', 'stop_sequence': None, 'type': 'message', 'usage': {'cache_creation_input_tokens': 2397, 'cache_read_input_tokens': 17215, 'input_tokens': 7, 'output_tokens': 242, 'service_tier': 'standard'}}}]\n",
154
+ "[INFO] 2025-07-07 20:51:18,489 | hud.environment | Step 8: Action: 💬 Response: \"Perfect! I have successfully completed the task...\" │ Reasoning: Perfect! I have successfully completed the task. I've calculated the loss on disposal of the automobile and entered the value in cell A1 of the ANSWER tab.\n",
155
+ "\n",
156
+ "Here's a summary of my calculation:\n",
157
+ "\n",
158
+ "1. Initial cost: $45,000\n",
159
+ "2. Estimated life: 7 years = 84 months\n",
160
+ "3. Monthly depreciation: $45,000 ÷ 84 = $535.71 per month\n",
161
+ "4. Period of use: Feb 1, 2022 to April 1, 2025 = 38 months\n",
162
+ "5. Accumulated depreciation: $535.71 × 38 = $20,357.14\n",
163
+ "6. Book value at disposal: $45,000 - $20,357.14 = $24,642.86\n",
164
+ "7. Loss on disposal: Book value - Salvage value = $24,642.86 - $0 = $24,642.86\n",
165
+ "\n",
166
+ "The answer of $24,642.86 is now displayed in cell A1 of the ANSWER tab, as requested. │ Logs: {'id': 'msg_01N5inFmfGSDtHWXjiZ1zDWS', 'content': [{'citations': None, 'text': \"Perfect! I have successfully completed the task. I've calculated the loss on disposal of the automobile and entered the value in cell A1 of the ANSWER tab.\\n\\nHere's a summary of my calculation:\\n\\...\n",
167
+ "[INFO] 2025-07-07 20:51:19,665 | hud.environment | Step 8: Observation: Observation(screenshot=None, text=None)\n",
168
+ "[INFO] 2025-07-07 20:51:30,847 | hud.environment | Evaluation result: {'logs': None, 'error': None, 'reward': 0.0}\n"
169
+ ]
170
+ },
171
+ {
172
+ "data": {
173
+ "text/plain": [
174
+ "{'logs': None, 'error': None, 'reward': 0.0}"
175
+ ]
176
+ },
177
+ "execution_count": 4,
178
+ "metadata": {},
179
+ "output_type": "execute_result"
180
+ }
181
+ ],
182
+ "source": [
183
+ "# Make sure to define your ANTHROPIC_API_KEY in your .env file\n",
184
+ "await env.run(ClaudeAgent(), max_steps=100)"
185
+ ]
186
+ },
187
+ {
188
+ "cell_type": "code",
189
+ "execution_count": 5,
190
+ "metadata": {},
191
+ "outputs": [],
192
+ "source": [
193
+ "await env.close()"
194
+ ]
195
+ }
196
+ ],
197
+ "metadata": {
198
+ "kernelspec": {
199
+ "display_name": ".venv",
200
+ "language": "python",
201
+ "name": "python3"
202
+ },
203
+ "language_info": {
204
+ "codemirror_mode": {
205
+ "name": "ipython",
206
+ "version": 3
207
+ },
208
+ "file_extension": ".py",
209
+ "mimetype": "text/x-python",
210
+ "name": "python",
211
+ "nbconvert_exporter": "python",
212
+ "pygments_lexer": "ipython3",
213
+ "version": "3.10.16"
214
+ }
215
+ },
216
+ "nbformat": 4,
217
+ "nbformat_minor": 2
218
+ }
@@ -309,7 +309,8 @@ class DockerClient(Client):
309
309
  if len(stdout_parts) > 1:
310
310
  result = json.loads(stdout_parts[1])
311
311
  else:
312
- raise InvokeError(stdout, stderr)
312
+ logger.warning("Potential error: %s", stderr)
313
+ result = None
313
314
 
314
315
  return result, stdout, stderr
315
316
 
@@ -5,4 +5,4 @@ def test_import():
5
5
  """Test that the package can be imported."""
6
6
  import hud
7
7
 
8
- assert hud.__version__ == "0.2.7"
8
+ assert hud.__version__ == "0.2.8"
@@ -4,4 +4,4 @@ Version information for the HUD SDK.
4
4
 
5
5
  from __future__ import annotations
6
6
 
7
- __version__ = "0.2.7"
7
+ __version__ = "0.2.8"
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "hud-python"
3
- version = "0.2.7"
3
+ version = "0.2.8"
4
4
  description = "SDK for the HUD evaluation platform."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10, <3.14"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes