hud-python 0.2.7__tar.gz → 0.2.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.2.7 → hud_python-0.2.9}/PKG-INFO +1 -1
- {hud_python-0.2.7 → hud_python-0.2.9}/examples/sheetbench_direct_example.ipynb +20 -56
- hud_python-0.2.9/examples/sheets_bench_cua_example.ipynb +360 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/common/types.py +2 -1
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/operator/adapter.py +4 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/operator.py +22 -3
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/env/docker_client.py +2 -1
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/job.py +4 -4
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/version.py +1 -1
- {hud_python-0.2.7 → hud_python-0.2.9}/pyproject.toml +1 -1
- {hud_python-0.2.7 → hud_python-0.2.9}/.env.example +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/.github/workflows/ci.yml +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/.github/workflows/release.yml +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/.gitignore +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/LICENSE +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/MANIFEST.in +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/README.md +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/advanced/cla-details.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/advanced/environment-control.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/advanced/tracing.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/advanced/uploading.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/adapters.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/env.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/gym.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/job.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/task.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/taskset.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/telemetry.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/api-reference/trajectory.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/concepts/adapter.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/concepts/agent.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/concepts/environment.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/concepts/job.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/concepts/task.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/concepts/trajectory.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/docs.json +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/environment-creation.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/environments/browser.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/environments/custom-environments.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/environments/custom.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/environments/osworld-ubuntu.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/environments/qa.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/examples/alignment-evaluation.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/examples/benchmarking-agents.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/examples/custom-os-env.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/examples/mcp-agent-tracing.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/examples/web-app-testing.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/examples/web-mocks.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/favicon.png +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/logo/hud_logo.svg +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/logo/hud_logo_dark.svg +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/quickstart.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/running-your-agent.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/docs/task-creation.mdx +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/novnc_ubuntu/Dockerfile +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/novnc_ubuntu/pyproject.toml +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/novnc_ubuntu/src/hud_controller/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/novnc_ubuntu/src/hud_controller/pyautogui_rosetta.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/novnc_ubuntu/src/hud_controller/step.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/Dockerfile +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/pyproject.toml +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/display_adapters.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/emulator.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/evaluator.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/kill.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/main.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/setup.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/pokemon_controller/src/hud_controller/step.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/Dockerfile +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/pyproject.toml +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/evaluate/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/evaluate/matchers.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/info.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/setup/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/setup/question.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/step.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/utils/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/environments/qa_controller/src/hud_controller/utils/state.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/examples/README.md +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/examples/appflowy.ipynb +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/examples/browser_use.ipynb +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/examples/custom_task_example.ipynb +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/examples/jobs.ipynb +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/examples/local.ipynb +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/examples/mcp_test.ipynb +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/examples/osworld.ipynb +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/examples/pokemon_local.ipynb +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/examples/pokemon_remote.ipynb +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/examples/remote.ipynb +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/examples/sensitive_data.ipynb +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/examples/tasks.ipynb +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/examples/wordle_example.ipynb +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/claude/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/claude/adapter.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/claude/tests/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/claude/tests/test_adapter.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/common/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/common/adapter.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/common/tests/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/common/tests/test_adapter.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/operator/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/operator/tests/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/adapters/operator/tests/test_adapter.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/base.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/claude.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/claude_plays_pokemon.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/langchain.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/misc/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/misc/response_agent.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/tests/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/agent/tests/test_base.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/env/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/env/client.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/env/environment.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/env/local_docker_client.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/env/remote_client.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/env/remote_docker_client.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/base.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/inspect.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/judge.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/match.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/remote.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/tests/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/tests/test_inspect.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/tests/test_judge.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/tests/test_match.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/evaluators/tests/test_remote.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/exceptions.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/gym.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/py.typed +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/server/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/server/requests.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/server/tests/test_requests.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/settings.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/task.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/taskset.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/_trace.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/context.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/exporter.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/instrumentation/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/instrumentation/mcp.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/instrumentation/registry.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/mcp_models.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/tests/test_context.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/telemetry/tests/test_trace.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/trajectory.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/types.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/agent.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/common.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/config.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/misc.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/progress.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/telemetry.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/tests/test_common.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/tests/test_config.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.2.7 → hud_python-0.2.9}/hud/utils/tests/test_telemetry.py +0 -0
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
"export HUD_API_KEY=your_api_key_here\n",
|
|
15
15
|
"pip install hud-python\n",
|
|
16
16
|
"pip install pandas # optional for debugging\n",
|
|
17
|
+
"pip install openpyxl # also a requirement for pandas\n",
|
|
17
18
|
"```"
|
|
18
19
|
]
|
|
19
20
|
},
|
|
@@ -26,7 +27,7 @@
|
|
|
26
27
|
},
|
|
27
28
|
{
|
|
28
29
|
"cell_type": "code",
|
|
29
|
-
"execution_count":
|
|
30
|
+
"execution_count": 2,
|
|
30
31
|
"metadata": {},
|
|
31
32
|
"outputs": [],
|
|
32
33
|
"source": [
|
|
@@ -51,24 +52,9 @@
|
|
|
51
52
|
},
|
|
52
53
|
{
|
|
53
54
|
"cell_type": "code",
|
|
54
|
-
"execution_count":
|
|
55
|
+
"execution_count": null,
|
|
55
56
|
"metadata": {},
|
|
56
|
-
"outputs": [
|
|
57
|
-
{
|
|
58
|
-
"name": "stdout",
|
|
59
|
-
"output_type": "stream",
|
|
60
|
-
"text": [
|
|
61
|
-
"Prompt: Sheet REV_QTR lists quarterly revenue from Q1-2022 through Q1-2025. Compute the compound annual growth rate between those two points. Enter the result in ANSWER!A1.\n",
|
|
62
|
-
"['REV_QTR']\n",
|
|
63
|
-
" Quarter Revenue\n",
|
|
64
|
-
"0 Q1-2022 4200000\n",
|
|
65
|
-
"1 Q2-2022 4404653\n",
|
|
66
|
-
"2 Q3-2022 4746171\n",
|
|
67
|
-
"3 Q4-2022 5062265\n",
|
|
68
|
-
"4 Q1-2023 5365661\n"
|
|
69
|
-
]
|
|
70
|
-
}
|
|
71
|
-
],
|
|
57
|
+
"outputs": [],
|
|
72
58
|
"source": [
|
|
73
59
|
"# Load in the first task for testing\n",
|
|
74
60
|
"test_task = taskset[1]\n",
|
|
@@ -93,20 +79,9 @@
|
|
|
93
79
|
},
|
|
94
80
|
{
|
|
95
81
|
"cell_type": "code",
|
|
96
|
-
"execution_count":
|
|
82
|
+
"execution_count": null,
|
|
97
83
|
"metadata": {},
|
|
98
|
-
"outputs": [
|
|
99
|
-
{
|
|
100
|
-
"name": "stdout",
|
|
101
|
-
"output_type": "stream",
|
|
102
|
-
"text": [
|
|
103
|
-
"['REV_QTR', 'ANSWER']\n",
|
|
104
|
-
"Empty DataFrame\n",
|
|
105
|
-
"Columns: [0.241955862]\n",
|
|
106
|
-
"Index: []\n"
|
|
107
|
-
]
|
|
108
|
-
}
|
|
109
|
-
],
|
|
84
|
+
"outputs": [],
|
|
110
85
|
"source": [
|
|
111
86
|
"###\n",
|
|
112
87
|
"### Your agent loop goes here, using *prompt* and *input_xlsx_file*, returns *output_xlsx_file*\n",
|
|
@@ -129,15 +104,7 @@
|
|
|
129
104
|
"cell_type": "code",
|
|
130
105
|
"execution_count": null,
|
|
131
106
|
"metadata": {},
|
|
132
|
-
"outputs": [
|
|
133
|
-
{
|
|
134
|
-
"name": "stdout",
|
|
135
|
-
"output_type": "stream",
|
|
136
|
-
"text": [
|
|
137
|
-
"Reward: 1.0\n"
|
|
138
|
-
]
|
|
139
|
-
}
|
|
140
|
-
],
|
|
107
|
+
"outputs": [],
|
|
141
108
|
"source": [
|
|
142
109
|
"# Get the base64 encoded xlsx file\n",
|
|
143
110
|
"base64_output_xlsx_file = base64.b64encode(output_xlsx_file).decode(\"utf-8\")\n",
|
|
@@ -174,7 +141,7 @@
|
|
|
174
141
|
},
|
|
175
142
|
{
|
|
176
143
|
"cell_type": "code",
|
|
177
|
-
"execution_count":
|
|
144
|
+
"execution_count": 3,
|
|
178
145
|
"metadata": {},
|
|
179
146
|
"outputs": [],
|
|
180
147
|
"source": [
|
|
@@ -185,11 +152,11 @@
|
|
|
185
152
|
},
|
|
186
153
|
{
|
|
187
154
|
"cell_type": "code",
|
|
188
|
-
"execution_count":
|
|
155
|
+
"execution_count": 17,
|
|
189
156
|
"metadata": {},
|
|
190
157
|
"outputs": [],
|
|
191
158
|
"source": [
|
|
192
|
-
"async def run_single_task(task):\n",
|
|
159
|
+
"async def run_single_task(task, job=None):\n",
|
|
193
160
|
" prompt = task.prompt\n",
|
|
194
161
|
" input_xlsx_file = requests.get(task.setup.args[0]).content\n",
|
|
195
162
|
"\n",
|
|
@@ -202,7 +169,7 @@
|
|
|
202
169
|
" # Run evaluation\n",
|
|
203
170
|
" task.setup = (\"sheets_from_bytes\", base64_output_xlsx_file)\n",
|
|
204
171
|
" task.id = None\n",
|
|
205
|
-
" env = await gym.make(task)\n",
|
|
172
|
+
" env = await gym.make(task, job=job)\n",
|
|
206
173
|
" result = await env.evaluate()\n",
|
|
207
174
|
" await env.close()\n",
|
|
208
175
|
"\n",
|
|
@@ -211,25 +178,22 @@
|
|
|
211
178
|
},
|
|
212
179
|
{
|
|
213
180
|
"cell_type": "code",
|
|
214
|
-
"execution_count":
|
|
181
|
+
"execution_count": null,
|
|
215
182
|
"metadata": {},
|
|
216
|
-
"outputs": [
|
|
217
|
-
{
|
|
218
|
-
"name": "stdout",
|
|
219
|
-
"output_type": "stream",
|
|
220
|
-
"text": [
|
|
221
|
-
"Average reward: 0.0\n"
|
|
222
|
-
]
|
|
223
|
-
}
|
|
224
|
-
],
|
|
183
|
+
"outputs": [],
|
|
225
184
|
"source": [
|
|
226
185
|
"# Loading and evaluating 50 tasks should take around 2 minutes, without the agent loop\n",
|
|
227
186
|
"import asyncio\n",
|
|
228
187
|
"\n",
|
|
188
|
+
"# Adds the job to the app.hud.so platform, optional\n",
|
|
189
|
+
"from hud import create_job\n",
|
|
190
|
+
"\n",
|
|
191
|
+
"# Run the taskset\n",
|
|
229
192
|
"taskset = await load_taskset(\"SheetBench-50\", metadata={\"partial\": True})\n",
|
|
193
|
+
"job = await create_job(\"SheetBench-50-Excel-Agent\", evalset_id=taskset.id)\n",
|
|
230
194
|
"\n",
|
|
231
|
-
"
|
|
232
|
-
"rewards = await asyncio.gather(*
|
|
195
|
+
"task_runs = [run_single_task(task, job) for task in taskset]\n",
|
|
196
|
+
"rewards = await asyncio.gather(*task_runs)\n",
|
|
233
197
|
"\n",
|
|
234
198
|
"print(f\"Average reward: {sum(rewards) / len(rewards)}\")"
|
|
235
199
|
]
|