hud-python 0.2.7__tar.gz → 0.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (169)
  1. {hud_python-0.2.7 → hud_python-0.2.9}/PKG-INFO +1 -1
  2. {hud_python-0.2.7 → hud_python-0.2.9}/examples/sheetbench_direct_example.ipynb +20 -56
  3. hud_python-0.2.9/examples/sheets_bench_cua_example.ipynb +360 -0
  4. {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/common/types.py +2 -1
  5. {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/operator/adapter.py +4 -0
  6. {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/operator.py +22 -3
  7. {hud_python-0.2.7 → hud_python-0.2.9}/hud/env/docker_client.py +2 -1
  8. {hud_python-0.2.7 → hud_python-0.2.9}/hud/job.py +4 -4
  9. {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/tests/test_version.py +1 -1
  10. {hud_python-0.2.7 → hud_python-0.2.9}/hud/version.py +1 -1
  11. {hud_python-0.2.7 → hud_python-0.2.9}/pyproject.toml +1 -1
  12. {hud_python-0.2.7 → hud_python-0.2.9}/.env.example +0 -0
  13. {hud_python-0.2.7 → hud_python-0.2.9}/.github/workflows/ci.yml +0 -0
  14. {hud_python-0.2.7 → hud_python-0.2.9}/.github/workflows/release.yml +0 -0
  15. {hud_python-0.2.7 → hud_python-0.2.9}/.gitignore +0 -0
  16. {hud_python-0.2.7 → hud_python-0.2.9}/LICENSE +0 -0
  17. {hud_python-0.2.7 → hud_python-0.2.9}/MANIFEST.in +0 -0
  18. {hud_python-0.2.7 → hud_python-0.2.9}/README.md +0 -0
  19. {hud_python-0.2.7 → hud_python-0.2.9}/docs/advanced/cla-details.mdx +0 -0
  20. {hud_python-0.2.7 → hud_python-0.2.9}/docs/advanced/environment-control.mdx +0 -0
  21. {hud_python-0.2.7 → hud_python-0.2.9}/docs/advanced/tracing.mdx +0 -0
  22. {hud_python-0.2.7 → hud_python-0.2.9}/docs/advanced/uploading.mdx +0 -0
  23. {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/adapters.mdx +0 -0
  24. {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/env.mdx +0 -0
  25. {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/gym.mdx +0 -0
  26. {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/job.mdx +0 -0
  27. {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/task.mdx +0 -0
  28. {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/taskset.mdx +0 -0
  29. {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/telemetry.mdx +0 -0
  30. {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/trajectory.mdx +0 -0
  31. {hud_python-0.2.7 → hud_python-0.2.9}/docs/concepts/adapter.mdx +0 -0
  32. {hud_python-0.2.7 → hud_python-0.2.9}/docs/concepts/agent.mdx +0 -0
  33. {hud_python-0.2.7 → hud_python-0.2.9}/docs/concepts/environment.mdx +0 -0
  34. {hud_python-0.2.7 → hud_python-0.2.9}/docs/concepts/job.mdx +0 -0
  35. {hud_python-0.2.7 → hud_python-0.2.9}/docs/concepts/task.mdx +0 -0
  36. {hud_python-0.2.7 → hud_python-0.2.9}/docs/concepts/trajectory.mdx +0 -0
  37. {hud_python-0.2.7 → hud_python-0.2.9}/docs/docs.json +0 -0
  38. {hud_python-0.2.7 → hud_python-0.2.9}/docs/environment-creation.mdx +0 -0
  39. {hud_python-0.2.7 → hud_python-0.2.9}/docs/environments/browser.mdx +0 -0
  40. {hud_python-0.2.7 → hud_python-0.2.9}/docs/environments/custom-environments.mdx +0 -0
  41. {hud_python-0.2.7 → hud_python-0.2.9}/docs/environments/custom.mdx +0 -0
  42. {hud_python-0.2.7 → hud_python-0.2.9}/docs/environments/osworld-ubuntu.mdx +0 -0
  43. {hud_python-0.2.7 → hud_python-0.2.9}/docs/environments/qa.mdx +0 -0
  44. {hud_python-0.2.7 → hud_python-0.2.9}/docs/examples/alignment-evaluation.mdx +0 -0
  45. {hud_python-0.2.7 → hud_python-0.2.9}/docs/examples/benchmarking-agents.mdx +0 -0
  46. {hud_python-0.2.7 → hud_python-0.2.9}/docs/examples/custom-os-env.mdx +0 -0
  47. {hud_python-0.2.7 → hud_python-0.2.9}/docs/examples/mcp-agent-tracing.mdx +0 -0
  48. {hud_python-0.2.7 → hud_python-0.2.9}/docs/examples/web-app-testing.mdx +0 -0
  49. {hud_python-0.2.7 → hud_python-0.2.9}/docs/examples/web-mocks.mdx +0 -0
  50. {hud_python-0.2.7 → hud_python-0.2.9}/docs/favicon.png +0 -0
  51. {hud_python-0.2.7 → hud_python-0.2.9}/docs/logo/hud_logo.svg +0 -0
  52. {hud_python-0.2.7 → hud_python-0.2.9}/docs/logo/hud_logo_dark.svg +0 -0
  53. {hud_python-0.2.7 → hud_python-0.2.9}/docs/quickstart.mdx +0 -0
  54. {hud_python-0.2.7 → hud_python-0.2.9}/docs/running-your-agent.mdx +0 -0
  55. {hud_python-0.2.7 → hud_python-0.2.9}/docs/task-creation.mdx +0 -0
  56. {hud_python-0.2.7 → hud_python-0.2.9}/environments/novnc_ubuntu/Dockerfile +0 -0
  57. {hud_python-0.2.7 → hud_python-0.2.9}/environments/novnc_ubuntu/pyproject.toml +0 -0
  58. {hud_python-0.2.7 → hud_python-0.2.9}/environments/novnc_ubuntu/src/hud_controller/__init__.py +0 -0
  59. {hud_python-0.2.7 → hud_python-0.2.9}/environments/novnc_ubuntu/src/hud_controller/pyautogui_rosetta.py +0 -0
  60. {hud_python-0.2.7 → hud_python-0.2.9}/environments/novnc_ubuntu/src/hud_controller/step.py +0 -0
  61. {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/Dockerfile +0 -0
  62. {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/pyproject.toml +0 -0
  63. {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/__init__.py +0 -0
  64. {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/display_adapters.py +0 -0
  65. {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/emulator.py +0 -0
  66. {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/evaluator.py +0 -0
  67. {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/kill.py +0 -0
  68. {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/main.py +0 -0
  69. {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/setup.py +0 -0
  70. {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/step.py +0 -0
  71. {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/Dockerfile +0 -0
  72. {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/pyproject.toml +0 -0
  73. {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/__init__.py +0 -0
  74. {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/evaluate/__init__.py +0 -0
  75. {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/evaluate/matchers.py +0 -0
  76. {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/info.py +0 -0
  77. {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/setup/__init__.py +0 -0
  78. {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/setup/question.py +0 -0
  79. {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/step.py +0 -0
  80. {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/utils/__init__.py +0 -0
  81. {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/utils/state.py +0 -0
  82. {hud_python-0.2.7 → hud_python-0.2.9}/examples/README.md +0 -0
  83. {hud_python-0.2.7 → hud_python-0.2.9}/examples/appflowy.ipynb +0 -0
  84. {hud_python-0.2.7 → hud_python-0.2.9}/examples/browser_use.ipynb +0 -0
  85. {hud_python-0.2.7 → hud_python-0.2.9}/examples/custom_task_example.ipynb +0 -0
  86. {hud_python-0.2.7 → hud_python-0.2.9}/examples/jobs.ipynb +0 -0
  87. {hud_python-0.2.7 → hud_python-0.2.9}/examples/local.ipynb +0 -0
  88. {hud_python-0.2.7 → hud_python-0.2.9}/examples/mcp_test.ipynb +0 -0
  89. {hud_python-0.2.7 → hud_python-0.2.9}/examples/osworld.ipynb +0 -0
  90. {hud_python-0.2.7 → hud_python-0.2.9}/examples/pokemon_local.ipynb +0 -0
  91. {hud_python-0.2.7 → hud_python-0.2.9}/examples/pokemon_remote.ipynb +0 -0
  92. {hud_python-0.2.7 → hud_python-0.2.9}/examples/remote.ipynb +0 -0
  93. {hud_python-0.2.7 → hud_python-0.2.9}/examples/sensitive_data.ipynb +0 -0
  94. {hud_python-0.2.7 → hud_python-0.2.9}/examples/tasks.ipynb +0 -0
  95. {hud_python-0.2.7 → hud_python-0.2.9}/examples/wordle_example.ipynb +0 -0
  96. {hud_python-0.2.7 → hud_python-0.2.9}/hud/__init__.py +0 -0
  97. {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/__init__.py +0 -0
  98. {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/claude/__init__.py +0 -0
  99. {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/claude/adapter.py +0 -0
  100. {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/claude/tests/__init__.py +0 -0
  101. {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/claude/tests/test_adapter.py +0 -0
  102. {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/common/__init__.py +0 -0
  103. {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/common/adapter.py +0 -0
  104. {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/common/tests/__init__.py +0 -0
  105. {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/common/tests/test_adapter.py +0 -0
  106. {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/operator/__init__.py +0 -0
  107. {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/operator/tests/__init__.py +0 -0
  108. {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/operator/tests/test_adapter.py +0 -0
  109. {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/__init__.py +0 -0
  110. {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/base.py +0 -0
  111. {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/claude.py +0 -0
  112. {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/claude_plays_pokemon.py +0 -0
  113. {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/langchain.py +0 -0
  114. {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/misc/__init__.py +0 -0
  115. {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/misc/response_agent.py +0 -0
  116. {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/tests/__init__.py +0 -0
  117. {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/tests/test_base.py +0 -0
  118. {hud_python-0.2.7 → hud_python-0.2.9}/hud/env/__init__.py +0 -0
  119. {hud_python-0.2.7 → hud_python-0.2.9}/hud/env/client.py +0 -0
  120. {hud_python-0.2.7 → hud_python-0.2.9}/hud/env/environment.py +0 -0
  121. {hud_python-0.2.7 → hud_python-0.2.9}/hud/env/local_docker_client.py +0 -0
  122. {hud_python-0.2.7 → hud_python-0.2.9}/hud/env/remote_client.py +0 -0
  123. {hud_python-0.2.7 → hud_python-0.2.9}/hud/env/remote_docker_client.py +0 -0
  124. {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/__init__.py +0 -0
  125. {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/base.py +0 -0
  126. {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/inspect.py +0 -0
  127. {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/judge.py +0 -0
  128. {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/match.py +0 -0
  129. {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/remote.py +0 -0
  130. {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/tests/__init__.py +0 -0
  131. {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/tests/test_inspect.py +0 -0
  132. {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/tests/test_judge.py +0 -0
  133. {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/tests/test_match.py +0 -0
  134. {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/tests/test_remote.py +0 -0
  135. {hud_python-0.2.7 → hud_python-0.2.9}/hud/exceptions.py +0 -0
  136. {hud_python-0.2.7 → hud_python-0.2.9}/hud/gym.py +0 -0
  137. {hud_python-0.2.7 → hud_python-0.2.9}/hud/py.typed +0 -0
  138. {hud_python-0.2.7 → hud_python-0.2.9}/hud/server/__init__.py +0 -0
  139. {hud_python-0.2.7 → hud_python-0.2.9}/hud/server/requests.py +0 -0
  140. {hud_python-0.2.7 → hud_python-0.2.9}/hud/server/tests/__init__.py +0 -0
  141. {hud_python-0.2.7 → hud_python-0.2.9}/hud/server/tests/test_requests.py +0 -0
  142. {hud_python-0.2.7 → hud_python-0.2.9}/hud/settings.py +0 -0
  143. {hud_python-0.2.7 → hud_python-0.2.9}/hud/task.py +0 -0
  144. {hud_python-0.2.7 → hud_python-0.2.9}/hud/taskset.py +0 -0
  145. {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/__init__.py +0 -0
  146. {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/_trace.py +0 -0
  147. {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/context.py +0 -0
  148. {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/exporter.py +0 -0
  149. {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/instrumentation/__init__.py +0 -0
  150. {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/instrumentation/mcp.py +0 -0
  151. {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/instrumentation/registry.py +0 -0
  152. {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/mcp_models.py +0 -0
  153. {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/tests/__init__.py +0 -0
  154. {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/tests/test_context.py +0 -0
  155. {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/tests/test_trace.py +0 -0
  156. {hud_python-0.2.7 → hud_python-0.2.9}/hud/trajectory.py +0 -0
  157. {hud_python-0.2.7 → hud_python-0.2.9}/hud/types.py +0 -0
  158. {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/__init__.py +0 -0
  159. {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/agent.py +0 -0
  160. {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/common.py +0 -0
  161. {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/config.py +0 -0
  162. {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/misc.py +0 -0
  163. {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/progress.py +0 -0
  164. {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/telemetry.py +0 -0
  165. {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/tests/__init__.py +0 -0
  166. {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/tests/test_common.py +0 -0
  167. {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/tests/test_config.py +0 -0
  168. {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/tests/test_progress.py +0 -0
  169. {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/tests/test_telemetry.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.2.7
3
+ Version: 0.2.9
4
4
  Summary: SDK for the HUD evaluation platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
@@ -14,6 +14,7 @@
14
14
  "export HUD_API_KEY=your_api_key_here\n",
15
15
  "pip install hud-python\n",
16
16
  "pip install pandas # optional for debugging\n",
17
+ "pip install openpyxl # also a requirement for pandas\n",
17
18
  "```"
18
19
  ]
19
20
  },
@@ -26,7 +27,7 @@
26
27
  },
27
28
  {
28
29
  "cell_type": "code",
29
- "execution_count": 70,
30
+ "execution_count": 2,
30
31
  "metadata": {},
31
32
  "outputs": [],
32
33
  "source": [
@@ -51,24 +52,9 @@
51
52
  },
52
53
  {
53
54
  "cell_type": "code",
54
- "execution_count": 72,
55
+ "execution_count": null,
55
56
  "metadata": {},
56
- "outputs": [
57
- {
58
- "name": "stdout",
59
- "output_type": "stream",
60
- "text": [
61
- "Prompt: Sheet REV_QTR lists quarterly revenue from Q1-2022 through Q1-2025. Compute the compound annual growth rate between those two points. Enter the result in ANSWER!A1.\n",
62
- "['REV_QTR']\n",
63
- " Quarter Revenue\n",
64
- "0 Q1-2022 4200000\n",
65
- "1 Q2-2022 4404653\n",
66
- "2 Q3-2022 4746171\n",
67
- "3 Q4-2022 5062265\n",
68
- "4 Q1-2023 5365661\n"
69
- ]
70
- }
71
- ],
57
+ "outputs": [],
72
58
  "source": [
73
59
  "# Load in the first task for testing\n",
74
60
  "test_task = taskset[1]\n",
@@ -93,20 +79,9 @@
93
79
  },
94
80
  {
95
81
  "cell_type": "code",
96
- "execution_count": 73,
82
+ "execution_count": null,
97
83
  "metadata": {},
98
- "outputs": [
99
- {
100
- "name": "stdout",
101
- "output_type": "stream",
102
- "text": [
103
- "['REV_QTR', 'ANSWER']\n",
104
- "Empty DataFrame\n",
105
- "Columns: [0.241955862]\n",
106
- "Index: []\n"
107
- ]
108
- }
109
- ],
84
+ "outputs": [],
110
85
  "source": [
111
86
  "###\n",
112
87
  "### Your agent loop goes here, using *prompt* and *input_xlsx_file*, returns *output_xlsx_file*\n",
@@ -129,15 +104,7 @@
129
104
  "cell_type": "code",
130
105
  "execution_count": null,
131
106
  "metadata": {},
132
- "outputs": [
133
- {
134
- "name": "stdout",
135
- "output_type": "stream",
136
- "text": [
137
- "Reward: 1.0\n"
138
- ]
139
- }
140
- ],
107
+ "outputs": [],
141
108
  "source": [
142
109
  "# Get the base64 encoded xlsx file\n",
143
110
  "base64_output_xlsx_file = base64.b64encode(output_xlsx_file).decode(\"utf-8\")\n",
@@ -174,7 +141,7 @@
174
141
  },
175
142
  {
176
143
  "cell_type": "code",
177
- "execution_count": 64,
144
+ "execution_count": 3,
178
145
  "metadata": {},
179
146
  "outputs": [],
180
147
  "source": [
@@ -185,11 +152,11 @@
185
152
  },
186
153
  {
187
154
  "cell_type": "code",
188
- "execution_count": 69,
155
+ "execution_count": 17,
189
156
  "metadata": {},
190
157
  "outputs": [],
191
158
  "source": [
192
- "async def run_single_task(task):\n",
159
+ "async def run_single_task(task, job=None):\n",
193
160
  " prompt = task.prompt\n",
194
161
  " input_xlsx_file = requests.get(task.setup.args[0]).content\n",
195
162
  "\n",
@@ -202,7 +169,7 @@
202
169
  " # Run evaluation\n",
203
170
  " task.setup = (\"sheets_from_bytes\", base64_output_xlsx_file)\n",
204
171
  " task.id = None\n",
205
- " env = await gym.make(task)\n",
172
+ " env = await gym.make(task, job=job)\n",
206
173
  " result = await env.evaluate()\n",
207
174
  " await env.close()\n",
208
175
  "\n",
@@ -211,25 +178,22 @@
211
178
  },
212
179
  {
213
180
  "cell_type": "code",
214
- "execution_count": 68,
181
+ "execution_count": null,
215
182
  "metadata": {},
216
- "outputs": [
217
- {
218
- "name": "stdout",
219
- "output_type": "stream",
220
- "text": [
221
- "Average reward: 0.0\n"
222
- ]
223
- }
224
- ],
183
+ "outputs": [],
225
184
  "source": [
226
185
  "# Loading and evaluating 50 tasks should take around 2 minutes, without the agent loop\n",
227
186
  "import asyncio\n",
228
187
  "\n",
188
+ "# Adds the job to the app.hud.so platform, optional\n",
189
+ "from hud import create_job\n",
190
+ "\n",
191
+ "# Run the taskset\n",
229
192
  "taskset = await load_taskset(\"SheetBench-50\", metadata={\"partial\": True})\n",
193
+ "job = await create_job(\"SheetBench-50-Excel-Agent\", evalset_id=taskset.id)\n",
230
194
  "\n",
231
- "task_jobs = [run_single_task(task) for task in taskset]\n",
232
- "rewards = await asyncio.gather(*task_jobs)\n",
195
+ "task_runs = [run_single_task(task, job) for task in taskset]\n",
196
+ "rewards = await asyncio.gather(*task_runs)\n",
233
197
  "\n",
234
198
  "print(f\"Average reward: {sum(rewards) / len(rewards)}\")"
235
199
  ]