hud-python 0.2.6__tar.gz → 0.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.2.6 → hud_python-0.2.7}/PKG-INFO +9 -6
- {hud_python-0.2.6 → hud_python-0.2.7}/README.md +6 -4
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/docs.json +1 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/environment-creation.mdx +2 -2
- hud_python-0.2.7/docs/examples/web-mocks.mdx +240 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/task-creation.mdx +4 -0
- hud_python-0.2.7/examples/appflowy.ipynb +1552 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/examples/mcp_test.ipynb +22 -29
- hud_python-0.2.7/examples/sensitive_data.ipynb +89 -0
- hud_python-0.2.7/examples/sheetbench_direct_example.ipynb +266 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/examples/wordle_example.ipynb +1 -1
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/__init__.py +13 -10
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/claude/adapter.py +30 -18
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/common/adapter.py +0 -1
- hud_python-0.2.7/hud/adapters/common/types.py +445 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/operator/adapter.py +23 -13
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/base.py +5 -4
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/claude.py +65 -13
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/claude_plays_pokemon.py +2 -2
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/langchain.py +8 -2
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/operator.py +36 -11
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/tests/test_base.py +2 -2
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/env/docker_client.py +24 -2
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/env/environment.py +86 -40
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/env/local_docker_client.py +50 -4
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/env/remote_client.py +22 -4
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/env/remote_docker_client.py +6 -2
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/gym.py +15 -4
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/job.py +91 -26
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/settings.py +6 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/task.py +84 -6
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/taskset.py +63 -8
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/exporter.py +4 -6
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/trajectory.py +3 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/types.py +28 -2
- hud_python-0.2.7/hud/utils/agent.py +37 -0
- hud_python-0.2.7/hud/utils/common.py +256 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/config.py +11 -0
- hud_python-0.2.7/hud/utils/tests/test_common.py +277 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/version.py +1 -1
- {hud_python-0.2.6 → hud_python-0.2.7}/pyproject.toml +6 -4
- hud_python-0.2.6/hud/adapters/common/types.py +0 -320
- hud_python-0.2.6/hud/utils/common.py +0 -140
- hud_python-0.2.6/hud/utils/tests/test_common.py +0 -52
- {hud_python-0.2.6 → hud_python-0.2.7}/.env.example +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/.github/workflows/ci.yml +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/.github/workflows/release.yml +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/.gitignore +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/LICENSE +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/MANIFEST.in +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/advanced/cla-details.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/advanced/environment-control.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/advanced/tracing.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/advanced/uploading.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/adapters.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/env.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/gym.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/job.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/task.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/taskset.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/telemetry.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/trajectory.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/concepts/adapter.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/concepts/agent.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/concepts/environment.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/concepts/job.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/concepts/task.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/concepts/trajectory.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/environments/browser.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/environments/custom-environments.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/environments/custom.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/environments/osworld-ubuntu.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/environments/qa.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/examples/alignment-evaluation.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/examples/benchmarking-agents.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/examples/custom-os-env.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/examples/mcp-agent-tracing.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/examples/web-app-testing.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/favicon.png +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/logo/hud_logo.svg +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/logo/hud_logo_dark.svg +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/quickstart.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/docs/running-your-agent.mdx +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/novnc_ubuntu/Dockerfile +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/novnc_ubuntu/pyproject.toml +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/novnc_ubuntu/src/hud_controller/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/novnc_ubuntu/src/hud_controller/pyautogui_rosetta.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/novnc_ubuntu/src/hud_controller/step.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/Dockerfile +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/pyproject.toml +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/display_adapters.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/emulator.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/evaluator.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/kill.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/main.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/setup.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/step.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/Dockerfile +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/pyproject.toml +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/evaluate/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/evaluate/matchers.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/info.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/setup/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/setup/question.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/step.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/utils/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/utils/state.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/examples/README.md +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/examples/browser_use.ipynb +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/examples/custom_task_example.ipynb +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/examples/jobs.ipynb +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/examples/local.ipynb +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/examples/osworld.ipynb +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/examples/pokemon_local.ipynb +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/examples/pokemon_remote.ipynb +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/examples/remote.ipynb +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/examples/tasks.ipynb +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/claude/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/claude/tests/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/claude/tests/test_adapter.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/common/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/common/tests/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/common/tests/test_adapter.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/operator/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/operator/tests/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/operator/tests/test_adapter.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/misc/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/misc/response_agent.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/tests/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/env/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/env/client.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/base.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/inspect.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/judge.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/match.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/remote.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/tests/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/tests/test_inspect.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/tests/test_judge.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/tests/test_match.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/tests/test_remote.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/exceptions.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/py.typed +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/server/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/server/requests.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/server/tests/test_requests.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/_trace.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/context.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/instrumentation/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/instrumentation/mcp.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/instrumentation/registry.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/mcp_models.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/tests/test_context.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/tests/test_trace.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/misc.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/progress.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/telemetry.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/tests/test_config.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/tests/test_telemetry.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.2.6
|
|
3
|
+
Version: 0.2.7
|
|
4
4
|
Summary: SDK for the HUD evaluation platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
|
|
@@ -47,6 +47,7 @@ Requires-Dist: langchain-openai
|
|
|
47
47
|
Requires-Dist: mcp
|
|
48
48
|
Requires-Dist: numpy
|
|
49
49
|
Requires-Dist: openai
|
|
50
|
+
Requires-Dist: pathspec>=0.12.1
|
|
50
51
|
Requires-Dist: pillow>=11.1.0
|
|
51
52
|
Requires-Dist: pydantic-settings<3,>=2
|
|
52
53
|
Requires-Dist: pydantic<3,>=2
|
|
@@ -61,7 +62,7 @@ Requires-Dist: ipython<9; extra == 'dev'
|
|
|
61
62
|
Requires-Dist: jupyter-client; extra == 'dev'
|
|
62
63
|
Requires-Dist: jupyter-core; extra == 'dev'
|
|
63
64
|
Requires-Dist: openai; extra == 'dev'
|
|
64
|
-
Requires-Dist: pyright==1.1.
|
|
65
|
+
Requires-Dist: pyright==1.1.401; extra == 'dev'
|
|
65
66
|
Requires-Dist: pytest-asyncio; extra == 'dev'
|
|
66
67
|
Requires-Dist: pytest-cov; extra == 'dev'
|
|
67
68
|
Requires-Dist: pytest-mock; extra == 'dev'
|
|
@@ -90,7 +91,7 @@ We're here to help with eval strategies, custom environments, or improving your
|
|
|
90
91
|
|
|
91
92
|
## ✨ What You Can Do
|
|
92
93
|
|
|
93
|
-
**Evaluate Existing Benchmarks**
|
|
94
|
+
**[Evaluate Existing Benchmarks](https://docs.hud.so/examples/benchmarking-agents)**
|
|
94
95
|
```python
|
|
95
96
|
from hud import load_taskset, run_job, ClaudeAgent
|
|
96
97
|
|
|
@@ -98,7 +99,7 @@ taskset = await load_taskset("WebVoyager") # or GAIA, OSWorld-Ubuntu, Mind2Web
|
|
|
98
99
|
job = await run_job(ClaudeAgent, taskset, "my-evaluation")
|
|
99
100
|
```
|
|
100
101
|
|
|
101
|
-
**Create Custom Tasks**
|
|
102
|
+
**[Create Custom Tasks](https://docs.hud.so/task-creation)**
|
|
102
103
|
```python
|
|
103
104
|
from hud.task import Task
|
|
104
105
|
|
|
@@ -110,7 +111,7 @@ task = Task(
|
|
|
110
111
|
)
|
|
111
112
|
```
|
|
112
113
|
|
|
113
|
-
**Build Custom Environments**
|
|
114
|
+
**[Build Custom Environments](https://docs.hud.so/environment-creation)**
|
|
114
115
|
```python
|
|
115
116
|
from hud.types import CustomGym
|
|
116
117
|
|
|
@@ -123,7 +124,7 @@ custom_gym = CustomGym(
|
|
|
123
124
|
# Or create complex Docker environments - see environments/ folder for examples
|
|
124
125
|
```
|
|
125
126
|
|
|
126
|
-
**Trace Tool Calls Alongside HUD Environments (or Independently)**
|
|
127
|
+
**[Trace Tool Calls Alongside HUD Environments (or Independently)](https://docs.hud.so/examples/mcp-agent-tracing)**
|
|
127
128
|
```python
|
|
128
129
|
import hud
|
|
129
130
|
|
|
@@ -171,6 +172,7 @@ async def main():
|
|
|
171
172
|
setup=("goto", "google.com"),
|
|
172
173
|
evaluate=("contains_text", "capybara")
|
|
173
174
|
)
|
|
175
|
+
print(f"Running task with prompt: {task.prompt}")
|
|
174
176
|
|
|
175
177
|
# Create environment using the gym module
|
|
176
178
|
env = await gym.make(task)
|
|
@@ -182,6 +184,7 @@ async def main():
|
|
|
182
184
|
obs, _ = await env.reset() # Gets first observation
|
|
183
185
|
for i in range(5):
|
|
184
186
|
actions, done = await agent.predict(obs)
|
|
187
|
+
print(f"Agent action {i}: {actions}")
|
|
185
188
|
|
|
186
189
|
obs, reward, terminated, info = await env.step(actions)
|
|
187
190
|
if done or terminated: break
|
|
@@ -19,7 +19,7 @@ We're here to help with eval strategies, custom environments, or improving your
|
|
|
19
19
|
|
|
20
20
|
## ✨ What You Can Do
|
|
21
21
|
|
|
22
|
-
**Evaluate Existing Benchmarks**
|
|
22
|
+
**[Evaluate Existing Benchmarks](https://docs.hud.so/examples/benchmarking-agents)**
|
|
23
23
|
```python
|
|
24
24
|
from hud import load_taskset, run_job, ClaudeAgent
|
|
25
25
|
|
|
@@ -27,7 +27,7 @@ taskset = await load_taskset("WebVoyager") # or GAIA, OSWorld-Ubuntu, Mind2Web
|
|
|
27
27
|
job = await run_job(ClaudeAgent, taskset, "my-evaluation")
|
|
28
28
|
```
|
|
29
29
|
|
|
30
|
-
**Create Custom Tasks**
|
|
30
|
+
**[Create Custom Tasks](https://docs.hud.so/task-creation)**
|
|
31
31
|
```python
|
|
32
32
|
from hud.task import Task
|
|
33
33
|
|
|
@@ -39,7 +39,7 @@ task = Task(
|
|
|
39
39
|
)
|
|
40
40
|
```
|
|
41
41
|
|
|
42
|
-
**Build Custom Environments**
|
|
42
|
+
**[Build Custom Environments](https://docs.hud.so/environment-creation)**
|
|
43
43
|
```python
|
|
44
44
|
from hud.types import CustomGym
|
|
45
45
|
|
|
@@ -52,7 +52,7 @@ custom_gym = CustomGym(
|
|
|
52
52
|
# Or create complex Docker environments - see environments/ folder for examples
|
|
53
53
|
```
|
|
54
54
|
|
|
55
|
-
**Trace Tool Calls Alongside HUD Environments (or Independently)**
|
|
55
|
+
**[Trace Tool Calls Alongside HUD Environments (or Independently)](https://docs.hud.so/examples/mcp-agent-tracing)**
|
|
56
56
|
```python
|
|
57
57
|
import hud
|
|
58
58
|
|
|
@@ -100,6 +100,7 @@ async def main():
|
|
|
100
100
|
setup=("goto", "google.com"),
|
|
101
101
|
evaluate=("contains_text", "capybara")
|
|
102
102
|
)
|
|
103
|
+
print(f"Running task with prompt: {task.prompt}")
|
|
103
104
|
|
|
104
105
|
# Create environment using the gym module
|
|
105
106
|
env = await gym.make(task)
|
|
@@ -111,6 +112,7 @@ async def main():
|
|
|
111
112
|
obs, _ = await env.reset() # Gets first observation
|
|
112
113
|
for i in range(5):
|
|
113
114
|
actions, done = await agent.predict(obs)
|
|
115
|
+
print(f"Agent action {i}: {actions}")
|
|
114
116
|
|
|
115
117
|
obs, reward, terminated, info = await env.step(actions)
|
|
116
118
|
if done or terminated: break
|
|
@@ -329,7 +329,7 @@ We strongly encourage community contributions! If you've built a useful custom e
|
|
|
329
329
|
Check the `environments/` directory in the SDK for inspiration:
|
|
330
330
|
- `environments/novnc_ubuntu/`: Provides an Ubuntu desktop accessible via VNC, for GUI-based tasks.
|
|
331
331
|
- `environments/custom_website/`: A template for packaging and testing your own web application.
|
|
332
|
-
- `environments/
|
|
332
|
+
- `environments/pokemon_controller/`: Example of a retro gaming environment.
|
|
333
333
|
|
|
334
334
|
## Using Remote Custom Environments
|
|
335
335
|
|
|
@@ -375,4 +375,4 @@ task_on_remote2 = Task(
|
|
|
375
375
|
|
|
376
376
|
- **[Task Creation](/task-creation)**: How to define tasks that use your custom environments.
|
|
377
377
|
- **[Custom Environments Overview](/environments/custom)**: Higher-level concepts of custom environments.
|
|
378
|
-
- **[Browser Environment](/environments/browser)**: For standard web interaction tasks.
|
|
378
|
+
- **[Browser Environment](/environments/browser)**: For standard web interaction tasks.
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: 'Web Mocks'
|
|
3
|
+
description: 'Clone websites and host them as stable test environments for AI agents using HUD page archives.'
|
|
4
|
+
icon: 'clone'
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Page Cloning
|
|
8
|
+
|
|
9
|
+
This guide demonstrates how to create and host web archives for testing AI agents with consistent, offline-first environments. By cloning websites into WACZ (Web Archive Collection Zipped) files, you can ensure your agents always test against specific, unchanging versions of web pages.
|
|
10
|
+
|
|
11
|
+
**Goal**: Create reproducible web environments for testing browser-based agents without depending on live websites that might change or go offline.
|
|
12
|
+
|
|
13
|
+
**Concepts Covered**:
|
|
14
|
+
- Using ArchiveWeb.page to clone websites into WACZ files
|
|
15
|
+
- Hosting archives locally with the HUD page archives repository and `CustomGym`
|
|
16
|
+
- Uploading archives to app.hud.so for immediate cloud hosting
|
|
17
|
+
- Creating tasks that use these stable archived environments
|
|
18
|
+
|
|
19
|
+
## Prerequisites
|
|
20
|
+
|
|
21
|
+
- HUD SDK installed
|
|
22
|
+
- Docker installed (for local hosting option)
|
|
23
|
+
- ArchiveWeb.page browser extension (for cloning pages)
|
|
24
|
+
- API keys for HUD and your chosen agent
|
|
25
|
+
|
|
26
|
+
## Part 1: Cloning the Page
|
|
27
|
+
|
|
28
|
+
### Installing ArchiveWeb.page
|
|
29
|
+
|
|
30
|
+
1. **Install the Browser Extension**:
|
|
31
|
+
- Visit [ArchiveWeb.page](https://archiveweb.page)
|
|
32
|
+
- Install the extension for Chrome/Chromium-based browsers
|
|
33
|
+
- The extension icon will appear in your browser toolbar
|
|
34
|
+
|
|
35
|
+
2. **Create a New Archive**:
|
|
36
|
+
- Click the ArchiveWeb.page extension icon
|
|
37
|
+
- Click "Create New Collection"
|
|
38
|
+
- Give your collection a descriptive name (e.g., "my-test-site")
|
|
39
|
+
|
|
40
|
+
### Capturing Web Pages
|
|
41
|
+
|
|
42
|
+
1. **Start Archiving**:
|
|
43
|
+
- Click "Start" in the extension popup to begin an archiving session
|
|
44
|
+
- Navigate to the website you want to clone
|
|
45
|
+
- Interact with the site as your agent would (login, navigate through pages, fill forms)
|
|
46
|
+
- All pages and resources will be captured automatically
|
|
47
|
+
|
|
48
|
+
2. **Best Practices for Agent Testing**:
|
|
49
|
+
- Capture all relevant pages and states your agent will interact with
|
|
50
|
+
- Include error pages and edge cases
|
|
51
|
+
- If testing login flows, capture both logged-out and logged-in states
|
|
52
|
+
- For form submissions, capture the form page and success/error pages
|
|
53
|
+
|
|
54
|
+
3. **Stop and Download**:
|
|
55
|
+
- Click "Stop" in the extension when done capturing
|
|
56
|
+
- Click "Download" to save your collection
|
|
57
|
+
- Choose WACZ format (default)
|
|
58
|
+
- Save with a meaningful filename (e.g., `my-test-site.wacz`)
|
|
59
|
+
|
|
60
|
+
### Example: Cloning a Login Flow
|
|
61
|
+
|
|
62
|
+
```
|
|
63
|
+
1. Start archiving session
|
|
64
|
+
2. Visit https://example.com/login
|
|
65
|
+
3. Enter test credentials (e.g., testuser/password123)
|
|
66
|
+
4. Submit the form
|
|
67
|
+
5. Capture the dashboard/welcome page
|
|
68
|
+
6. Optionally capture logout flow
|
|
69
|
+
7. Stop and download as my-test-site.wacz
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Part 2: Hosting the Website
|
|
73
|
+
|
|
74
|
+
You have two options for hosting your archived website:
|
|
75
|
+
|
|
76
|
+
### Option 1: Local Hosting with CustomGym
|
|
77
|
+
|
|
78
|
+
This approach uses the [HUD page archives repository](https://github.com/hud-evals/page-archives) to host archives locally and access them via `CustomGym`.
|
|
79
|
+
|
|
80
|
+
#### Step 1: Clone the Page Archives Repository
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
git clone https://github.com/hud-evals/page-archives.git
|
|
84
|
+
cd page-archives
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
#### Step 2: Add Your Archive
|
|
88
|
+
|
|
89
|
+
1. **Place your WACZ file**:
|
|
90
|
+
```bash
|
|
91
|
+
cp ~/Downloads/my-test-site.wacz archives/
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
2. **Update `archives/archive_list.json`**:
|
|
95
|
+
```json
|
|
96
|
+
{
|
|
97
|
+
"archives": [
|
|
98
|
+
{
|
|
99
|
+
"name": "my-test-site",
|
|
100
|
+
"displayName": "My Test Site Archive",
|
|
101
|
+
"startPage": "https://example.com/login" // Optional: default page to open
|
|
102
|
+
}
|
|
103
|
+
// ... other archives
|
|
104
|
+
]
|
|
105
|
+
}
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Note: The `name` field must match your WACZ filename without the `.wacz` extension.
|
|
109
|
+
|
|
110
|
+
#### Step 3: Create a CustomGym for the Archive Server
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
from hud.types import CustomGym
|
|
114
|
+
from pathlib import Path
|
|
115
|
+
|
|
116
|
+
# Create a Dockerfile for the archive server
|
|
117
|
+
archive_server_dockerfile = """
|
|
118
|
+
FROM node:18-slim
|
|
119
|
+
WORKDIR /app
|
|
120
|
+
COPY . /app
|
|
121
|
+
RUN npm install
|
|
122
|
+
EXPOSE 3000
|
|
123
|
+
CMD ["npm", "run", "start"]
|
|
124
|
+
"""
|
|
125
|
+
|
|
126
|
+
# Save Dockerfile in the page-archives directory
|
|
127
|
+
with open("page-archives/Dockerfile", "w") as f:
|
|
128
|
+
f.write(archive_server_dockerfile)
|
|
129
|
+
|
|
130
|
+
# Define the CustomGym
|
|
131
|
+
archive_server_gym = CustomGym(
|
|
132
|
+
location="local",
|
|
133
|
+
image_or_build_context=Path("./page-archives"),
|
|
134
|
+
host_config={
|
|
135
|
+
"port_bindings": {3000: 3000} # Expose port 3000
|
|
136
|
+
}
|
|
137
|
+
)
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
#### Step 4: Create Tasks Using the Archived Site
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
from hud import Task, run_job
|
|
144
|
+
from hud.agent import ClaudeAgent
|
|
145
|
+
|
|
146
|
+
# Task to test login flow on the archived site
|
|
147
|
+
login_task = Task(
|
|
148
|
+
prompt="Log into the website using username 'testuser' and password 'password123'.",
|
|
149
|
+
gym="hud-browser", # Use browser to interact
|
|
150
|
+
setup=[
|
|
151
|
+
# Navigate to your archived site running locally
|
|
152
|
+
("goto", "http://localhost:3000/my-test-site")
|
|
153
|
+
],
|
|
154
|
+
evaluate=("page_contains", "Welcome, testuser!")
|
|
155
|
+
)
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
#### Advanced: Query Parameters
|
|
159
|
+
|
|
160
|
+
The archive viewer supports useful query parameters:
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
# Open a specific page within the archive
|
|
164
|
+
specific_page_task = Task(
|
|
165
|
+
prompt="Navigate to the user profile page",
|
|
166
|
+
gym="hud-browser",
|
|
167
|
+
setup=[
|
|
168
|
+
("goto", "http://localhost:3000/my-test-site?page=https%3A%2F%2Fexample.com%2Fprofile")
|
|
169
|
+
]
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
# Debug mode - shows full ReplayWeb.page UI
|
|
173
|
+
debug_task = Task(
|
|
174
|
+
prompt="Explore the archive interface",
|
|
175
|
+
gym="hud-browser",
|
|
176
|
+
setup=[
|
|
177
|
+
("goto", "http://localhost:3000/my-test-site?debug=true")
|
|
178
|
+
]
|
|
179
|
+
)
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### Option 2: Cloud Hosting on app.hud.so
|
|
183
|
+
|
|
184
|
+
For immediate hosting without local setup, use the HUD platform's built-in page cloning feature.
|
|
185
|
+
|
|
186
|
+
#### Step 1: Access Page Clone Feature
|
|
187
|
+
|
|
188
|
+
1. Go to [app.hud.so](https://app.hud.so)
|
|
189
|
+
2. Click "Create" in the navigation
|
|
190
|
+
3. Select "Page Clone"
|
|
191
|
+
|
|
192
|
+
#### Step 2: Upload Your Archive
|
|
193
|
+
|
|
194
|
+
1. Click "Upload WACZ file"
|
|
195
|
+
2. Select your `.wacz` file created in Part 1
|
|
196
|
+
3. Provide a name for your cloned environment
|
|
197
|
+
4. Click "Create"
|
|
198
|
+
|
|
199
|
+
#### Step 3: Use the Hosted Archive
|
|
200
|
+
|
|
201
|
+
Once uploaded, you'll receive a URL for your hosted archive (e.g., `https://archives.hud.so/your-archive-id`).
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
from hud import Task, run_job
|
|
205
|
+
from hud.agent import ClaudeAgent
|
|
206
|
+
|
|
207
|
+
# Task using the cloud-hosted archive
|
|
208
|
+
cloud_login_task = Task(
|
|
209
|
+
prompt="Log into the website using username 'testuser' and password 'password123'.",
|
|
210
|
+
gym="hud-browser",
|
|
211
|
+
setup=[
|
|
212
|
+
# Navigate to your cloud-hosted archive
|
|
213
|
+
("goto", "https://archives.hud.so/your-archive-id")
|
|
214
|
+
],
|
|
215
|
+
evaluate=("page_contains", "Welcome, testuser!")
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
# Run evaluation
|
|
219
|
+
job = await run_job(
|
|
220
|
+
agent_cls=ClaudeAgent,
|
|
221
|
+
task_or_taskset=cloud_login_task,
|
|
222
|
+
job_name="Cloud Archive Test"
|
|
223
|
+
)
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
## Tips for Effective Page Cloning
|
|
227
|
+
|
|
228
|
+
1. **Capture Complete Flows**: Don't just capture individual pages - capture entire user journeys
|
|
229
|
+
2. **Include Resources**: Ensure CSS, JavaScript, and images are properly captured
|
|
230
|
+
3. **Test Your Archives**: Always verify your archives work correctly before using them in evaluations
|
|
231
|
+
4. **Document States**: Keep notes on what states and pages are included in each archive
|
|
232
|
+
5. **Update Regularly**: Re-clone sites when significant changes occur
|
|
233
|
+
|
|
234
|
+
## Key Takeaways
|
|
235
|
+
|
|
236
|
+
- ArchiveWeb.page makes it easy to create WACZ archives of any website
|
|
237
|
+
- Local hosting with CustomGym gives you full control and fast performance
|
|
238
|
+
- Cloud hosting on app.hud.so provides instant deployment without infrastructure
|
|
239
|
+
- Page cloning ensures consistent, reproducible testing environments for AI agents
|
|
240
|
+
- Archived sites eliminate external dependencies and enable offline testing
|
|
@@ -30,6 +30,10 @@ task = Task(
|
|
|
30
30
|
setup=("goto", "https://news.example.com"), # Function to run at env.reset()
|
|
31
31
|
evaluate=("page_contains", "artificial intelligence") # Function to run at env.evaluate()
|
|
32
32
|
)
|
|
33
|
+
|
|
34
|
+
# Create environment
|
|
35
|
+
env = await gym.make(task)
|
|
36
|
+
# ...
|
|
33
37
|
```
|
|
34
38
|
|
|
35
39
|
## Setup Functions (for `hud-browser`)
|