hud-python 0.2.5__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.2.5 → hud_python-0.2.6}/PKG-INFO +18 -18
- {hud_python-0.2.5 → hud_python-0.2.6}/README.md +17 -17
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/concepts/environment.mdx +0 -2
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/docs.json +0 -1
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/environment-creation.mdx +23 -1
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/quickstart.mdx +5 -2
- hud_python-0.2.6/examples/osworld.ipynb +199 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/examples/tasks.ipynb +4 -11
- hud_python-0.2.6/examples/wordle_example.ipynb +244 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/claude_plays_pokemon.py +2 -1
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/env/remote_docker_client.py +2 -2
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/job.py +9 -9
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/server/requests.py +26 -4
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/settings.py +1 -1
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/taskset.py +16 -4
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/context.py +33 -57
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/instrumentation/mcp.py +0 -3
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/tests/test_context.py +7 -3
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/types.py +1 -1
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/version.py +1 -1
- {hud_python-0.2.5 → hud_python-0.2.6}/pyproject.toml +1 -1
- hud_python-0.2.5/docs/environments/ubuntu.mdx +0 -118
- hud_python-0.2.5/examples/osworld.ipynb +0 -240
- {hud_python-0.2.5 → hud_python-0.2.6}/.env.example +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/.github/workflows/ci.yml +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/.github/workflows/release.yml +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/.gitignore +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/LICENSE +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/MANIFEST.in +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/advanced/cla-details.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/advanced/environment-control.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/advanced/tracing.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/advanced/uploading.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/adapters.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/env.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/gym.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/job.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/task.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/taskset.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/telemetry.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/api-reference/trajectory.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/concepts/adapter.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/concepts/agent.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/concepts/job.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/concepts/task.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/concepts/trajectory.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/environments/browser.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/environments/custom-environments.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/environments/custom.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/environments/osworld-ubuntu.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/environments/qa.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/examples/alignment-evaluation.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/examples/benchmarking-agents.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/examples/custom-os-env.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/examples/mcp-agent-tracing.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/examples/web-app-testing.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/favicon.png +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/logo/hud_logo.svg +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/logo/hud_logo_dark.svg +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/running-your-agent.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/docs/task-creation.mdx +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/novnc_ubuntu/Dockerfile +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/novnc_ubuntu/pyproject.toml +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/novnc_ubuntu/src/hud_controller/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/novnc_ubuntu/src/hud_controller/pyautogui_rosetta.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/novnc_ubuntu/src/hud_controller/step.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/Dockerfile +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/pyproject.toml +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/display_adapters.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/emulator.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/evaluator.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/kill.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/main.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/setup.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/pokemon_controller/src/hud_controller/step.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/Dockerfile +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/pyproject.toml +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/evaluate/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/evaluate/matchers.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/info.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/setup/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/setup/question.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/step.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/utils/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/environments/qa_controller/src/hud_controller/utils/state.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/examples/README.md +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/examples/browser_use.ipynb +0 -0
- /hud_python-0.2.5/examples/example.ipynb → /hud_python-0.2.6/examples/custom_task_example.ipynb +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/examples/jobs.ipynb +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/examples/local.ipynb +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/examples/mcp_test.ipynb +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/examples/pokemon_local.ipynb +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/examples/pokemon_remote.ipynb +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/examples/remote.ipynb +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/claude/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/claude/adapter.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/claude/tests/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/claude/tests/test_adapter.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/common/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/common/adapter.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/common/tests/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/common/tests/test_adapter.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/common/types.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/operator/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/operator/adapter.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/operator/tests/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/adapters/operator/tests/test_adapter.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/base.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/claude.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/langchain.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/misc/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/misc/response_agent.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/operator.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/tests/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/agent/tests/test_base.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/env/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/env/client.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/env/docker_client.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/env/environment.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/env/local_docker_client.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/env/remote_client.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/base.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/inspect.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/judge.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/match.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/remote.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/tests/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/tests/test_inspect.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/tests/test_judge.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/tests/test_match.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/evaluators/tests/test_remote.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/exceptions.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/gym.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/py.typed +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/server/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/server/tests/test_requests.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/task.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/_trace.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/exporter.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/instrumentation/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/instrumentation/registry.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/mcp_models.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/telemetry/tests/test_trace.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/trajectory.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/common.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/config.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/misc.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/progress.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/telemetry.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/tests/test_common.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/tests/test_config.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.2.5 → hud_python-0.2.6}/hud/utils/tests/test_telemetry.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.2.5
|
|
3
|
+
Version: 0.2.6
|
|
4
4
|
Summary: SDK for the HUD evaluation platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
|
|
@@ -74,17 +74,17 @@ Description-Content-Type: text/markdown
|
|
|
74
74
|
</div>
|
|
75
75
|
|
|
76
76
|
<h3>
|
|
77
|
-
|
|
77
|
+
Evaluate your Computer Use AI agents across web browsers, desktop environments, and custom scenarios.
|
|
78
78
|
</h3>
|
|
79
79
|
|
|
80
|
-
|
|
81
|
-
>
|
|
82
|
-
> [📅 Hop on a call ](https://cal.com/jay-ram-z6st6w/demo) or [📧 founders@hud.so](mailto:founders@hud.so)
|
|
83
|
-
>
|
|
84
|
-
> We're here to help with eval strategies, custom environments, or improving your agent architecture!
|
|
80
|
+
### 🚀 Are you a startup building agents?
|
|
85
81
|
|
|
82
|
+
[📅 Hop on a call](https://cal.com/jay-ram-z6st6w/demo) or [📧 founders@hud.so](mailto:founders@hud.so)
|
|
86
83
|
|
|
87
|
-
|
|
84
|
+
We're here to help with eval strategies, custom environments, or improving your agent architecture!
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
> **Early Release Notice**: We'd love to hear your feedback in [Issues](https://github.com/hud-evals/hud-sdk/issues), as the SDK is still evolving!
|
|
88
88
|
|
|
89
89
|
[](https://pypi.org/project/hud-python/)
|
|
90
90
|
|
|
@@ -132,23 +132,23 @@ with hud.trace("my-agent-run"):
|
|
|
132
132
|
result = await agent.run(task)
|
|
133
133
|
```
|
|
134
134
|
|
|
135
|
-
##
|
|
136
|
-
|
|
137
|
-
Before getting started, you'll need to obtain an API key:
|
|
135
|
+
## Quick Start
|
|
138
136
|
|
|
139
|
-
|
|
140
|
-
2. Set it in your environment or .env file:
|
|
137
|
+
### Installation
|
|
141
138
|
|
|
142
139
|
```bash
|
|
143
|
-
|
|
140
|
+
pip install hud-python
|
|
144
141
|
```
|
|
145
142
|
|
|
146
|
-
|
|
143
|
+
### API Key Setup
|
|
147
144
|
|
|
148
|
-
|
|
145
|
+
Before getting started, you'll need to obtain an API key:
|
|
146
|
+
|
|
147
|
+
1. Visit [app.hud.so](https://app.hud.so) to create a free account and generate your API key
|
|
148
|
+
2. Set it in your environment or .env file:
|
|
149
149
|
|
|
150
150
|
```bash
|
|
151
|
-
|
|
151
|
+
export HUD_API_KEY=your_api_key_here
|
|
152
152
|
```
|
|
153
153
|
|
|
154
154
|
### Simple Browser Example with Claude Computer Use
|
|
@@ -269,4 +269,4 @@ If you use this SDK in your research, please cite it as follows:
|
|
|
269
269
|
url = {https://github.com/hud-evals/hud-sdk},
|
|
270
270
|
langid = {en}
|
|
271
271
|
}
|
|
272
|
-
```
|
|
272
|
+
```
|
|
@@ -3,17 +3,17 @@
|
|
|
3
3
|
</div>
|
|
4
4
|
|
|
5
5
|
<h3>
|
|
6
|
-
|
|
6
|
+
Evaluate your Computer Use AI agents across web browsers, desktop environments, and custom scenarios.
|
|
7
7
|
</h3>
|
|
8
8
|
|
|
9
|
-
|
|
10
|
-
>
|
|
11
|
-
> [📅 Hop on a call ](https://cal.com/jay-ram-z6st6w/demo) or [📧 founders@hud.so](mailto:founders@hud.so)
|
|
12
|
-
>
|
|
13
|
-
> We're here to help with eval strategies, custom environments, or improving your agent architecture!
|
|
9
|
+
### 🚀 Are you a startup building agents?
|
|
14
10
|
|
|
11
|
+
[📅 Hop on a call](https://cal.com/jay-ram-z6st6w/demo) or [📧 founders@hud.so](mailto:founders@hud.so)
|
|
15
12
|
|
|
16
|
-
|
|
13
|
+
We're here to help with eval strategies, custom environments, or improving your agent architecture!
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
> **Early Release Notice**: We'd love to hear your feedback in [Issues](https://github.com/hud-evals/hud-sdk/issues), as the SDK is still evolving!
|
|
17
17
|
|
|
18
18
|
[](https://pypi.org/project/hud-python/)
|
|
19
19
|
|
|
@@ -61,23 +61,23 @@ with hud.trace("my-agent-run"):
|
|
|
61
61
|
result = await agent.run(task)
|
|
62
62
|
```
|
|
63
63
|
|
|
64
|
-
##
|
|
65
|
-
|
|
66
|
-
Before getting started, you'll need to obtain an API key:
|
|
64
|
+
## Quick Start
|
|
67
65
|
|
|
68
|
-
|
|
69
|
-
2. Set it in your environment or .env file:
|
|
66
|
+
### Installation
|
|
70
67
|
|
|
71
68
|
```bash
|
|
72
|
-
|
|
69
|
+
pip install hud-python
|
|
73
70
|
```
|
|
74
71
|
|
|
75
|
-
|
|
72
|
+
### API Key Setup
|
|
76
73
|
|
|
77
|
-
|
|
74
|
+
Before getting started, you'll need to obtain an API key:
|
|
75
|
+
|
|
76
|
+
1. Visit [app.hud.so](https://app.hud.so) to create a free account and generate your API key
|
|
77
|
+
2. Set it in your environment or .env file:
|
|
78
78
|
|
|
79
79
|
```bash
|
|
80
|
-
|
|
80
|
+
export HUD_API_KEY=your_api_key_here
|
|
81
81
|
```
|
|
82
82
|
|
|
83
83
|
### Simple Browser Example with Claude Computer Use
|
|
@@ -198,4 +198,4 @@ If you use this SDK in your research, please cite it as follows:
|
|
|
198
198
|
url = {https://github.com/hud-evals/hud-sdk},
|
|
199
199
|
langid = {en}
|
|
200
200
|
}
|
|
201
|
-
```
|
|
201
|
+
```
|
|
@@ -54,8 +54,6 @@ The HUD SDK provides several standard environment types, specified via the `gym`
|
|
|
54
54
|
|
|
55
55
|
* **`"hud-browser"`**: Provides a remote Chromium browser instance managed via Playwright. Ideal for web navigation, form interaction, and testing web applications.
|
|
56
56
|
* [See `hud-browser` Details](../environments/hud-browser.mdx)
|
|
57
|
-
* **`"hud-ubuntu"`**: Provides a remote Ubuntu desktop environment accessed via VNC. Suitable for tasks involving GUI applications, file system interaction, or running Linux software.
|
|
58
|
-
* [See `hud-ubuntu` Details](../environments/hud-ubuntu.mdx)
|
|
59
57
|
* **`"qa"`**: A non-interactive environment for question-answering tasks where the agent provides a direct textual response.
|
|
60
58
|
* [See `qa` Environment Details](../environments/qa.mdx)
|
|
61
59
|
* **`CustomGym`**: Allows defining and running your own [Custom Environments](../advanced/custom-environments.mdx) using Docker, either locally or remotely. This provides maximum flexibility for specific testing needs.
|
|
@@ -63,8 +63,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
|
63
63
|
# ... other system dependencies for your environment (e.g., desktop, browsers) ...
|
|
64
64
|
&& rm -rf /var/lib/apt/lists/*
|
|
65
65
|
|
|
66
|
+
# Upgrade pip and setuptools to ensure PEP 660 support
|
|
67
|
+
RUN pip3 install --upgrade pip "setuptools>=64.0.0" wheel
|
|
68
|
+
|
|
66
69
|
# Copy your controller source code
|
|
67
70
|
WORKDIR /app
|
|
71
|
+
RUN mkdir /app_data
|
|
72
|
+
|
|
68
73
|
COPY ./src /app/src
|
|
69
74
|
COPY ./pyproject.toml /app/
|
|
70
75
|
|
|
@@ -92,7 +97,7 @@ dependencies = [
|
|
|
92
97
|
]
|
|
93
98
|
|
|
94
99
|
[build-system]
|
|
95
|
-
requires = ["setuptools>=
|
|
100
|
+
requires = ["setuptools>=64.0.0", "wheel"]
|
|
96
101
|
build-backend = "setuptools.build_meta"
|
|
97
102
|
|
|
98
103
|
[project.scripts]
|
|
@@ -135,9 +140,26 @@ def verify_output_file(expected_content: str) -> float:
|
|
|
135
140
|
logger.error("Evaluation failed: Output file not found.")
|
|
136
141
|
return 0.0 # Failure
|
|
137
142
|
|
|
143
|
+
def step(action: str) -> str:
|
|
144
|
+
"""Example step function for a Task."""
|
|
145
|
+
logger.info(f"Controller: Stepping with {action=}")
|
|
146
|
+
|
|
147
|
+
return {
|
|
148
|
+
"observation": {
|
|
149
|
+
"text": "Sample Text",
|
|
150
|
+
"screenshot": None
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
138
154
|
# You can add more functions as needed for different setup/evaluation logic
|
|
139
155
|
```
|
|
140
156
|
|
|
157
|
+
### d. `src/hud_controller/__init__.py`
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from .main import initialize_environment, verify_output_file, step
|
|
161
|
+
```
|
|
162
|
+
|
|
141
163
|
## 4. Building & Testing Locally
|
|
142
164
|
|
|
143
165
|
### a. Define `CustomGym`
|
|
@@ -15,7 +15,7 @@ See [Installation](/installation) for more details on development setup.
|
|
|
15
15
|
|
|
16
16
|
## 2. API Key Setup
|
|
17
17
|
|
|
18
|
-
Set your API keys in a `.env` file:
|
|
18
|
+
Set your API keys in a `.env` file (get your HUD API key from [app.hud.so](https://app.hud.so)):
|
|
19
19
|
|
|
20
20
|
```bash
|
|
21
21
|
HUD_API_KEY=sk-hud-...
|
|
@@ -51,9 +51,12 @@ async def main():
|
|
|
51
51
|
await env.close()
|
|
52
52
|
|
|
53
53
|
if __name__ == "__main__":
|
|
54
|
-
|
|
54
|
+
asyncio.run(main())
|
|
55
55
|
```
|
|
56
56
|
|
|
57
|
+
Each gym (`hud-browser`, `OSWorld-Ubuntu`, custom) has its own set of setup and evaluate functions, and you can define your own.
|
|
58
|
+
See [setup](/environments/browser#setup-functions-initial-state) and [evaluators](/environments/browser#evaluation-functions) for more info on available functions.
|
|
59
|
+
|
|
57
60
|
### Manual Agent Loop
|
|
58
61
|
```python
|
|
59
62
|
env = await gym.make(task)
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "code",
|
|
5
|
+
"execution_count": 1,
|
|
6
|
+
"metadata": {},
|
|
7
|
+
"outputs": [],
|
|
8
|
+
"source": [
|
|
9
|
+
"# uv pip install -e \".[dev]\"\n",
|
|
10
|
+
"from hud import gym, load_taskset\n",
|
|
11
|
+
"from pprint import pprint"
|
|
12
|
+
]
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"cell_type": "code",
|
|
16
|
+
"execution_count": 2,
|
|
17
|
+
"metadata": {},
|
|
18
|
+
"outputs": [
|
|
19
|
+
{
|
|
20
|
+
"name": "stdout",
|
|
21
|
+
"output_type": "stream",
|
|
22
|
+
"text": [
|
|
23
|
+
"Total tasks in OSWorld: 369\n",
|
|
24
|
+
"Task prompt: Can you make my computer bring back the last tab I shut down?\n"
|
|
25
|
+
]
|
|
26
|
+
}
|
|
27
|
+
],
|
|
28
|
+
"source": [
|
|
29
|
+
"taskset = await load_taskset(\"OSWorld-Ubuntu\")\n",
|
|
30
|
+
"print(f\"Total tasks in OSWorld: {len(taskset)}\")\n",
|
|
31
|
+
"\n",
|
|
32
|
+
"test = taskset[144]\n",
|
|
33
|
+
"print(f\"Task prompt: {test.prompt}\")"
|
|
34
|
+
]
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
"cell_type": "code",
|
|
38
|
+
"execution_count": 3,
|
|
39
|
+
"metadata": {},
|
|
40
|
+
"outputs": [
|
|
41
|
+
{
|
|
42
|
+
"name": "stderr",
|
|
43
|
+
"output_type": "stream",
|
|
44
|
+
"text": [
|
|
45
|
+
"2025-05-27 10:04:56,691 - hud.gym - INFO - Creating private environment\n"
|
|
46
|
+
]
|
|
47
|
+
}
|
|
48
|
+
],
|
|
49
|
+
"source": [
|
|
50
|
+
"# The Ubuntu environment will take around 2.5 minutes to start, but can be parallelized\n",
|
|
51
|
+
"env = await gym.make(test)"
|
|
52
|
+
]
|
|
53
|
+
},
|
|
54
|
+
{
|
|
55
|
+
"cell_type": "code",
|
|
56
|
+
"execution_count": 4,
|
|
57
|
+
"metadata": {},
|
|
58
|
+
"outputs": [
|
|
59
|
+
{
|
|
60
|
+
"name": "stdout",
|
|
61
|
+
"output_type": "stream",
|
|
62
|
+
"text": [
|
|
63
|
+
"Initial observation complete\n",
|
|
64
|
+
"========= Step 1 =========\n",
|
|
65
|
+
"Agent's action: [PressAction(type='press', keys=['ctrl', 'shift', 't'])]\n",
|
|
66
|
+
"========= Step 2 =========\n",
|
|
67
|
+
"Agent's action: [ResponseAction(type='response', text=\"Great! I've successfully reopened your last closed tab. As you can see, the TripAdvisor tab has been restored. Now you have three tabs open:\\n\\n1. Lonely Planet | Travel Guide\\n2. Airbnb | Vacation rentals\\n3. TripAdvisor: Over a billion reviews & contributions for Hotels\\n\\nThe keyboard shortcut Ctrl+Shift+T is very useful for recovering recently closed tabs in Chrome. You can actually press it multiple times to continue reopening previously closed tabs in the order they were closed.\")]\n"
|
|
68
|
+
]
|
|
69
|
+
}
|
|
70
|
+
],
|
|
71
|
+
"source": [
|
|
72
|
+
"from hud.agent import ClaudeAgent\n",
|
|
73
|
+
"\n",
|
|
74
|
+
"# Define a new agent each time to reset the message history\n",
|
|
75
|
+
"# Make sure to define the environment variable ANTHROPIC_API_KEY\n",
|
|
76
|
+
"agent = ClaudeAgent()\n",
|
|
77
|
+
"\n",
|
|
78
|
+
"# Initial observation\n",
|
|
79
|
+
"obs, _ = await env.reset()\n",
|
|
80
|
+
"print(f\"Initial observation complete\")\n",
|
|
81
|
+
"\n",
|
|
82
|
+
"# Agent loop\n",
|
|
83
|
+
"for i in range(8):\n",
|
|
84
|
+
" print(f\"========= Step {i + 1} =========\")\n",
|
|
85
|
+
" action, done = await agent.predict(obs)\n",
|
|
86
|
+
" print(f\"Agent's action: {action}\")\n",
|
|
87
|
+
"\n",
|
|
88
|
+
" obs, reward, terminated, info = await env.step(action)\n",
|
|
89
|
+
"\n",
|
|
90
|
+
" if done or terminated:\n",
|
|
91
|
+
" break"
|
|
92
|
+
]
|
|
93
|
+
},
|
|
94
|
+
{
|
|
95
|
+
"cell_type": "code",
|
|
96
|
+
"execution_count": 5,
|
|
97
|
+
"metadata": {},
|
|
98
|
+
"outputs": [
|
|
99
|
+
{
|
|
100
|
+
"name": "stdout",
|
|
101
|
+
"output_type": "stream",
|
|
102
|
+
"text": [
|
|
103
|
+
"{'error': None,\n",
|
|
104
|
+
" 'logs': 'INFO: Starting evaluation...\\n'\n",
|
|
105
|
+
" 'INFO: Evaluating task 08d9a8b1-7b7a-4ba7-a226-4e266e13f6df...\\n'\n",
|
|
106
|
+
" 'INFO: Evaluator configuration:\\n'\n",
|
|
107
|
+
" 'INFO: Metric function(s): is_expected_tabs\\n'\n",
|
|
108
|
+
" 'INFO: Metric conjunction: and\\n'\n",
|
|
109
|
+
" 'INFO: Result getter: get_open_tabs_info\\n'\n",
|
|
110
|
+
" 'INFO: Expected getter: get_rule\\n'\n",
|
|
111
|
+
" 'INFO: Metric options: {}\\n'\n",
|
|
112
|
+
" 'INFO: Setting up post-config for evaluation...\\n'\n",
|
|
113
|
+
" 'INFO: Evaluating single metric: is_expected_tabs\\n'\n",
|
|
114
|
+
" \"INFO: Getting result state using config: {'type': 'open_tabs_info'}\\n\"\n",
|
|
115
|
+
" \"INFO: Getting expected state using config: {'type': 'rule', 'rules': \"\n",
|
|
116
|
+
" \"{'type': 'url', 'urls': ['https://www.lonelyplanet.com', \"\n",
|
|
117
|
+
" \"'https://www.airbnb.com', 'https://www.tripadvisor.com']}}\\n\"\n",
|
|
118
|
+
" 'INFO: Comparing result state with expected state\\n'\n",
|
|
119
|
+
" 'INFO: Final evaluation result: 1\\n'\n",
|
|
120
|
+
" 'INFO: Completed evaluation.\\n'\n",
|
|
121
|
+
" 'INFO: Completed evaluation.\\n',\n",
|
|
122
|
+
" 'reward': 1.0}\n"
|
|
123
|
+
]
|
|
124
|
+
}
|
|
125
|
+
],
|
|
126
|
+
"source": [
|
|
127
|
+
"# Evaluate environment state\n",
|
|
128
|
+
"result = await env.evaluate()\n",
|
|
129
|
+
"pprint(result)"
|
|
130
|
+
]
|
|
131
|
+
},
|
|
132
|
+
{
|
|
133
|
+
"cell_type": "code",
|
|
134
|
+
"execution_count": 6,
|
|
135
|
+
"metadata": {},
|
|
136
|
+
"outputs": [],
|
|
137
|
+
"source": [
|
|
138
|
+
"# Make sure to close environment to avoid being charged for idle time\n",
|
|
139
|
+
"await env.close()"
|
|
140
|
+
]
|
|
141
|
+
},
|
|
142
|
+
{
|
|
143
|
+
"cell_type": "markdown",
|
|
144
|
+
"metadata": {},
|
|
145
|
+
"source": [
|
|
146
|
+
"Paralell runs for the whole dataset"
|
|
147
|
+
]
|
|
148
|
+
},
|
|
149
|
+
{
|
|
150
|
+
"cell_type": "code",
|
|
151
|
+
"execution_count": 26,
|
|
152
|
+
"metadata": {},
|
|
153
|
+
"outputs": [],
|
|
154
|
+
"source": [
|
|
155
|
+
"from hud import run_job\n",
|
|
156
|
+
"\n",
|
|
157
|
+
"taskset = await load_taskset(\"OSWorld-Ubuntu\")\n",
|
|
158
|
+
"job = await run_job(\n",
|
|
159
|
+
" ClaudeAgent,\n",
|
|
160
|
+
" taskset,\n",
|
|
161
|
+
" \"osworld-test\",\n",
|
|
162
|
+
" max_steps_per_task=20,\n",
|
|
163
|
+
" max_concurrent_tasks=20,\n",
|
|
164
|
+
" auto_reply_question=True,\n",
|
|
165
|
+
")"
|
|
166
|
+
]
|
|
167
|
+
},
|
|
168
|
+
{
|
|
169
|
+
"cell_type": "code",
|
|
170
|
+
"execution_count": null,
|
|
171
|
+
"metadata": {},
|
|
172
|
+
"outputs": [],
|
|
173
|
+
"source": [
|
|
174
|
+
"await job.get_analytics()"
|
|
175
|
+
]
|
|
176
|
+
}
|
|
177
|
+
],
|
|
178
|
+
"metadata": {
|
|
179
|
+
"kernelspec": {
|
|
180
|
+
"display_name": ".venv",
|
|
181
|
+
"language": "python",
|
|
182
|
+
"name": "python3"
|
|
183
|
+
},
|
|
184
|
+
"language_info": {
|
|
185
|
+
"codemirror_mode": {
|
|
186
|
+
"name": "ipython",
|
|
187
|
+
"version": 3
|
|
188
|
+
},
|
|
189
|
+
"file_extension": ".py",
|
|
190
|
+
"mimetype": "text/x-python",
|
|
191
|
+
"name": "python",
|
|
192
|
+
"nbconvert_exporter": "python",
|
|
193
|
+
"pygments_lexer": "ipython3",
|
|
194
|
+
"version": "3.12.9"
|
|
195
|
+
}
|
|
196
|
+
},
|
|
197
|
+
"nbformat": 4,
|
|
198
|
+
"nbformat_minor": 2
|
|
199
|
+
}
|
|
@@ -7,7 +7,6 @@
|
|
|
7
7
|
"outputs": [],
|
|
8
8
|
"source": [
|
|
9
9
|
"from hud import gym\n",
|
|
10
|
-
"from hud.utils import stream\n",
|
|
11
10
|
"from hud.task import Task"
|
|
12
11
|
]
|
|
13
12
|
},
|
|
@@ -41,10 +40,7 @@
|
|
|
41
40
|
"source": [
|
|
42
41
|
"# Create and set up environment with google, takes around 20 seconds\n",
|
|
43
42
|
"env = await gym.make(task)\n",
|
|
44
|
-
"
|
|
45
|
-
"\n",
|
|
46
|
-
"# Stream the live view\n",
|
|
47
|
-
"stream(urls[\"live_url\"])"
|
|
43
|
+
"await env.stream()"
|
|
48
44
|
]
|
|
49
45
|
},
|
|
50
46
|
{
|
|
@@ -127,10 +123,7 @@
|
|
|
127
123
|
"source": [
|
|
128
124
|
"# Create and set up environment with google, takes around 20 seconds\n",
|
|
129
125
|
"env = await gym.make(task)\n",
|
|
130
|
-
"
|
|
131
|
-
"\n",
|
|
132
|
-
"# Stream the live view\n",
|
|
133
|
-
"stream(urls[\"live_url\"])"
|
|
126
|
+
"await env.stream()"
|
|
134
127
|
]
|
|
135
128
|
},
|
|
136
129
|
{
|
|
@@ -217,9 +210,9 @@
|
|
|
217
210
|
"metadata": {},
|
|
218
211
|
"outputs": [],
|
|
219
212
|
"source": [
|
|
220
|
-
"from hud
|
|
213
|
+
"from hud import Response\n",
|
|
221
214
|
"\n",
|
|
222
|
-
"await env.step([
|
|
215
|
+
"await env.step([Response(text=\"Paris\")])"
|
|
223
216
|
]
|
|
224
217
|
},
|
|
225
218
|
{
|