hud-python 0.2.6__tar.gz → 0.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. Click here for more details.

Files changed (171)
  1. {hud_python-0.2.6 → hud_python-0.2.7}/PKG-INFO +9 -6
  2. {hud_python-0.2.6 → hud_python-0.2.7}/README.md +6 -4
  3. {hud_python-0.2.6 → hud_python-0.2.7}/docs/docs.json +1 -0
  4. {hud_python-0.2.6 → hud_python-0.2.7}/docs/environment-creation.mdx +2 -2
  5. hud_python-0.2.7/docs/examples/web-mocks.mdx +240 -0
  6. {hud_python-0.2.6 → hud_python-0.2.7}/docs/task-creation.mdx +4 -0
  7. hud_python-0.2.7/examples/appflowy.ipynb +1552 -0
  8. {hud_python-0.2.6 → hud_python-0.2.7}/examples/mcp_test.ipynb +22 -29
  9. hud_python-0.2.7/examples/sensitive_data.ipynb +89 -0
  10. hud_python-0.2.7/examples/sheetbench_direct_example.ipynb +266 -0
  11. {hud_python-0.2.6 → hud_python-0.2.7}/examples/wordle_example.ipynb +1 -1
  12. {hud_python-0.2.6 → hud_python-0.2.7}/hud/__init__.py +13 -10
  13. {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/claude/adapter.py +30 -18
  14. {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/common/adapter.py +0 -1
  15. hud_python-0.2.7/hud/adapters/common/types.py +445 -0
  16. {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/operator/adapter.py +23 -13
  17. {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/base.py +5 -4
  18. {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/claude.py +65 -13
  19. {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/claude_plays_pokemon.py +2 -2
  20. {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/langchain.py +8 -2
  21. {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/operator.py +36 -11
  22. {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/tests/test_base.py +2 -2
  23. {hud_python-0.2.6 → hud_python-0.2.7}/hud/env/docker_client.py +24 -2
  24. {hud_python-0.2.6 → hud_python-0.2.7}/hud/env/environment.py +86 -40
  25. {hud_python-0.2.6 → hud_python-0.2.7}/hud/env/local_docker_client.py +50 -4
  26. {hud_python-0.2.6 → hud_python-0.2.7}/hud/env/remote_client.py +22 -4
  27. {hud_python-0.2.6 → hud_python-0.2.7}/hud/env/remote_docker_client.py +6 -2
  28. {hud_python-0.2.6 → hud_python-0.2.7}/hud/gym.py +15 -4
  29. {hud_python-0.2.6 → hud_python-0.2.7}/hud/job.py +91 -26
  30. {hud_python-0.2.6 → hud_python-0.2.7}/hud/settings.py +6 -0
  31. {hud_python-0.2.6 → hud_python-0.2.7}/hud/task.py +84 -6
  32. {hud_python-0.2.6 → hud_python-0.2.7}/hud/taskset.py +63 -8
  33. {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/exporter.py +4 -6
  34. {hud_python-0.2.6 → hud_python-0.2.7}/hud/trajectory.py +3 -0
  35. {hud_python-0.2.6 → hud_python-0.2.7}/hud/types.py +28 -2
  36. hud_python-0.2.7/hud/utils/agent.py +37 -0
  37. hud_python-0.2.7/hud/utils/common.py +256 -0
  38. {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/config.py +11 -0
  39. hud_python-0.2.7/hud/utils/tests/test_common.py +277 -0
  40. {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/tests/test_version.py +1 -1
  41. {hud_python-0.2.6 → hud_python-0.2.7}/hud/version.py +1 -1
  42. {hud_python-0.2.6 → hud_python-0.2.7}/pyproject.toml +6 -4
  43. hud_python-0.2.6/hud/adapters/common/types.py +0 -320
  44. hud_python-0.2.6/hud/utils/common.py +0 -140
  45. hud_python-0.2.6/hud/utils/tests/test_common.py +0 -52
  46. {hud_python-0.2.6 → hud_python-0.2.7}/.env.example +0 -0
  47. {hud_python-0.2.6 → hud_python-0.2.7}/.github/workflows/ci.yml +0 -0
  48. {hud_python-0.2.6 → hud_python-0.2.7}/.github/workflows/release.yml +0 -0
  49. {hud_python-0.2.6 → hud_python-0.2.7}/.gitignore +0 -0
  50. {hud_python-0.2.6 → hud_python-0.2.7}/LICENSE +0 -0
  51. {hud_python-0.2.6 → hud_python-0.2.7}/MANIFEST.in +0 -0
  52. {hud_python-0.2.6 → hud_python-0.2.7}/docs/advanced/cla-details.mdx +0 -0
  53. {hud_python-0.2.6 → hud_python-0.2.7}/docs/advanced/environment-control.mdx +0 -0
  54. {hud_python-0.2.6 → hud_python-0.2.7}/docs/advanced/tracing.mdx +0 -0
  55. {hud_python-0.2.6 → hud_python-0.2.7}/docs/advanced/uploading.mdx +0 -0
  56. {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/adapters.mdx +0 -0
  57. {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/env.mdx +0 -0
  58. {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/gym.mdx +0 -0
  59. {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/job.mdx +0 -0
  60. {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/task.mdx +0 -0
  61. {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/taskset.mdx +0 -0
  62. {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/telemetry.mdx +0 -0
  63. {hud_python-0.2.6 → hud_python-0.2.7}/docs/api-reference/trajectory.mdx +0 -0
  64. {hud_python-0.2.6 → hud_python-0.2.7}/docs/concepts/adapter.mdx +0 -0
  65. {hud_python-0.2.6 → hud_python-0.2.7}/docs/concepts/agent.mdx +0 -0
  66. {hud_python-0.2.6 → hud_python-0.2.7}/docs/concepts/environment.mdx +0 -0
  67. {hud_python-0.2.6 → hud_python-0.2.7}/docs/concepts/job.mdx +0 -0
  68. {hud_python-0.2.6 → hud_python-0.2.7}/docs/concepts/task.mdx +0 -0
  69. {hud_python-0.2.6 → hud_python-0.2.7}/docs/concepts/trajectory.mdx +0 -0
  70. {hud_python-0.2.6 → hud_python-0.2.7}/docs/environments/browser.mdx +0 -0
  71. {hud_python-0.2.6 → hud_python-0.2.7}/docs/environments/custom-environments.mdx +0 -0
  72. {hud_python-0.2.6 → hud_python-0.2.7}/docs/environments/custom.mdx +0 -0
  73. {hud_python-0.2.6 → hud_python-0.2.7}/docs/environments/osworld-ubuntu.mdx +0 -0
  74. {hud_python-0.2.6 → hud_python-0.2.7}/docs/environments/qa.mdx +0 -0
  75. {hud_python-0.2.6 → hud_python-0.2.7}/docs/examples/alignment-evaluation.mdx +0 -0
  76. {hud_python-0.2.6 → hud_python-0.2.7}/docs/examples/benchmarking-agents.mdx +0 -0
  77. {hud_python-0.2.6 → hud_python-0.2.7}/docs/examples/custom-os-env.mdx +0 -0
  78. {hud_python-0.2.6 → hud_python-0.2.7}/docs/examples/mcp-agent-tracing.mdx +0 -0
  79. {hud_python-0.2.6 → hud_python-0.2.7}/docs/examples/web-app-testing.mdx +0 -0
  80. {hud_python-0.2.6 → hud_python-0.2.7}/docs/favicon.png +0 -0
  81. {hud_python-0.2.6 → hud_python-0.2.7}/docs/logo/hud_logo.svg +0 -0
  82. {hud_python-0.2.6 → hud_python-0.2.7}/docs/logo/hud_logo_dark.svg +0 -0
  83. {hud_python-0.2.6 → hud_python-0.2.7}/docs/quickstart.mdx +0 -0
  84. {hud_python-0.2.6 → hud_python-0.2.7}/docs/running-your-agent.mdx +0 -0
  85. {hud_python-0.2.6 → hud_python-0.2.7}/environments/novnc_ubuntu/Dockerfile +0 -0
  86. {hud_python-0.2.6 → hud_python-0.2.7}/environments/novnc_ubuntu/pyproject.toml +0 -0
  87. {hud_python-0.2.6 → hud_python-0.2.7}/environments/novnc_ubuntu/src/hud_controller/__init__.py +0 -0
  88. {hud_python-0.2.6 → hud_python-0.2.7}/environments/novnc_ubuntu/src/hud_controller/pyautogui_rosetta.py +0 -0
  89. {hud_python-0.2.6 → hud_python-0.2.7}/environments/novnc_ubuntu/src/hud_controller/step.py +0 -0
  90. {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/Dockerfile +0 -0
  91. {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/pyproject.toml +0 -0
  92. {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/__init__.py +0 -0
  93. {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/display_adapters.py +0 -0
  94. {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/emulator.py +0 -0
  95. {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/evaluator.py +0 -0
  96. {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/kill.py +0 -0
  97. {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/main.py +0 -0
  98. {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/setup.py +0 -0
  99. {hud_python-0.2.6 → hud_python-0.2.7}/environments/pokemon_controller/src/hud_controller/step.py +0 -0
  100. {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/Dockerfile +0 -0
  101. {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/pyproject.toml +0 -0
  102. {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/__init__.py +0 -0
  103. {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/evaluate/__init__.py +0 -0
  104. {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/evaluate/matchers.py +0 -0
  105. {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/info.py +0 -0
  106. {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/setup/__init__.py +0 -0
  107. {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/setup/question.py +0 -0
  108. {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/step.py +0 -0
  109. {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/utils/__init__.py +0 -0
  110. {hud_python-0.2.6 → hud_python-0.2.7}/environments/qa_controller/src/hud_controller/utils/state.py +0 -0
  111. {hud_python-0.2.6 → hud_python-0.2.7}/examples/README.md +0 -0
  112. {hud_python-0.2.6 → hud_python-0.2.7}/examples/browser_use.ipynb +0 -0
  113. {hud_python-0.2.6 → hud_python-0.2.7}/examples/custom_task_example.ipynb +0 -0
  114. {hud_python-0.2.6 → hud_python-0.2.7}/examples/jobs.ipynb +0 -0
  115. {hud_python-0.2.6 → hud_python-0.2.7}/examples/local.ipynb +0 -0
  116. {hud_python-0.2.6 → hud_python-0.2.7}/examples/osworld.ipynb +0 -0
  117. {hud_python-0.2.6 → hud_python-0.2.7}/examples/pokemon_local.ipynb +0 -0
  118. {hud_python-0.2.6 → hud_python-0.2.7}/examples/pokemon_remote.ipynb +0 -0
  119. {hud_python-0.2.6 → hud_python-0.2.7}/examples/remote.ipynb +0 -0
  120. {hud_python-0.2.6 → hud_python-0.2.7}/examples/tasks.ipynb +0 -0
  121. {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/__init__.py +0 -0
  122. {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/claude/__init__.py +0 -0
  123. {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/claude/tests/__init__.py +0 -0
  124. {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/claude/tests/test_adapter.py +0 -0
  125. {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/common/__init__.py +0 -0
  126. {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/common/tests/__init__.py +0 -0
  127. {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/common/tests/test_adapter.py +0 -0
  128. {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/operator/__init__.py +0 -0
  129. {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/operator/tests/__init__.py +0 -0
  130. {hud_python-0.2.6 → hud_python-0.2.7}/hud/adapters/operator/tests/test_adapter.py +0 -0
  131. {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/__init__.py +0 -0
  132. {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/misc/__init__.py +0 -0
  133. {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/misc/response_agent.py +0 -0
  134. {hud_python-0.2.6 → hud_python-0.2.7}/hud/agent/tests/__init__.py +0 -0
  135. {hud_python-0.2.6 → hud_python-0.2.7}/hud/env/__init__.py +0 -0
  136. {hud_python-0.2.6 → hud_python-0.2.7}/hud/env/client.py +0 -0
  137. {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/__init__.py +0 -0
  138. {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/base.py +0 -0
  139. {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/inspect.py +0 -0
  140. {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/judge.py +0 -0
  141. {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/match.py +0 -0
  142. {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/remote.py +0 -0
  143. {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/tests/__init__.py +0 -0
  144. {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/tests/test_inspect.py +0 -0
  145. {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/tests/test_judge.py +0 -0
  146. {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/tests/test_match.py +0 -0
  147. {hud_python-0.2.6 → hud_python-0.2.7}/hud/evaluators/tests/test_remote.py +0 -0
  148. {hud_python-0.2.6 → hud_python-0.2.7}/hud/exceptions.py +0 -0
  149. {hud_python-0.2.6 → hud_python-0.2.7}/hud/py.typed +0 -0
  150. {hud_python-0.2.6 → hud_python-0.2.7}/hud/server/__init__.py +0 -0
  151. {hud_python-0.2.6 → hud_python-0.2.7}/hud/server/requests.py +0 -0
  152. {hud_python-0.2.6 → hud_python-0.2.7}/hud/server/tests/__init__.py +0 -0
  153. {hud_python-0.2.6 → hud_python-0.2.7}/hud/server/tests/test_requests.py +0 -0
  154. {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/__init__.py +0 -0
  155. {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/_trace.py +0 -0
  156. {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/context.py +0 -0
  157. {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/instrumentation/__init__.py +0 -0
  158. {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/instrumentation/mcp.py +0 -0
  159. {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/instrumentation/registry.py +0 -0
  160. {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/mcp_models.py +0 -0
  161. {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/tests/__init__.py +0 -0
  162. {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/tests/test_context.py +0 -0
  163. {hud_python-0.2.6 → hud_python-0.2.7}/hud/telemetry/tests/test_trace.py +0 -0
  164. {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/__init__.py +0 -0
  165. {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/misc.py +0 -0
  166. {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/progress.py +0 -0
  167. {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/telemetry.py +0 -0
  168. {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/tests/__init__.py +0 -0
  169. {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/tests/test_config.py +0 -0
  170. {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/tests/test_progress.py +0 -0
  171. {hud_python-0.2.6 → hud_python-0.2.7}/hud/utils/tests/test_telemetry.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.2.6
3
+ Version: 0.2.7
4
4
  Summary: SDK for the HUD evaluation platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-sdk
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-sdk/issues
@@ -47,6 +47,7 @@ Requires-Dist: langchain-openai
47
47
  Requires-Dist: mcp
48
48
  Requires-Dist: numpy
49
49
  Requires-Dist: openai
50
+ Requires-Dist: pathspec>=0.12.1
50
51
  Requires-Dist: pillow>=11.1.0
51
52
  Requires-Dist: pydantic-settings<3,>=2
52
53
  Requires-Dist: pydantic<3,>=2
@@ -61,7 +62,7 @@ Requires-Dist: ipython<9; extra == 'dev'
61
62
  Requires-Dist: jupyter-client; extra == 'dev'
62
63
  Requires-Dist: jupyter-core; extra == 'dev'
63
64
  Requires-Dist: openai; extra == 'dev'
64
- Requires-Dist: pyright==1.1.364; extra == 'dev'
65
+ Requires-Dist: pyright==1.1.401; extra == 'dev'
65
66
  Requires-Dist: pytest-asyncio; extra == 'dev'
66
67
  Requires-Dist: pytest-cov; extra == 'dev'
67
68
  Requires-Dist: pytest-mock; extra == 'dev'
@@ -90,7 +91,7 @@ We're here to help with eval strategies, custom environments, or improving your
90
91
 
91
92
  ## ✨ What You Can Do
92
93
 
93
- **Evaluate Existing Benchmarks**
94
+ **[Evaluate Existing Benchmarks](https://docs.hud.so/examples/benchmarking-agents)**
94
95
  ```python
95
96
  from hud import load_taskset, run_job, ClaudeAgent
96
97
 
@@ -98,7 +99,7 @@ taskset = await load_taskset("WebVoyager") # or GAIA, OSWorld-Ubuntu, Mind2Web
98
99
  job = await run_job(ClaudeAgent, taskset, "my-evaluation")
99
100
  ```
100
101
 
101
- **Create Custom Tasks**
102
+ **[Create Custom Tasks](https://docs.hud.so/task-creation)**
102
103
  ```python
103
104
  from hud.task import Task
104
105
 
@@ -110,7 +111,7 @@ task = Task(
110
111
  )
111
112
  ```
112
113
 
113
- **Build Custom Environments**
114
+ **[Build Custom Environments](https://docs.hud.so/environment-creation)**
114
115
  ```python
115
116
  from hud.types import CustomGym
116
117
 
@@ -123,7 +124,7 @@ custom_gym = CustomGym(
123
124
  # Or create complex Docker environments - see environments/ folder for examples
124
125
  ```
125
126
 
126
- **Trace Tool Calls Alongside HUD Environments (or Independently)**
127
+ **[Trace Tool Calls Alongside HUD Environments (or Independently)](https://docs.hud.so/examples/mcp-agent-tracing)**
127
128
  ```python
128
129
  import hud
129
130
 
@@ -171,6 +172,7 @@ async def main():
171
172
  setup=("goto", "google.com"),
172
173
  evaluate=("contains_text", "capybara")
173
174
  )
175
+ print(f"Running task with prompt: {task.prompt}")
174
176
 
175
177
  # Create environment using the gym module
176
178
  env = await gym.make(task)
@@ -182,6 +184,7 @@ async def main():
182
184
  obs, _ = await env.reset() # Gets first observation
183
185
  for i in range(5):
184
186
  actions, done = await agent.predict(obs)
187
+ print(f"Agent action {i}: {actions}")
185
188
 
186
189
  obs, reward, terminated, info = await env.step(actions)
187
190
  if done or terminated: break
@@ -19,7 +19,7 @@ We're here to help with eval strategies, custom environments, or improving your
19
19
 
20
20
  ## ✨ What You Can Do
21
21
 
22
- **Evaluate Existing Benchmarks**
22
+ **[Evaluate Existing Benchmarks](https://docs.hud.so/examples/benchmarking-agents)**
23
23
  ```python
24
24
  from hud import load_taskset, run_job, ClaudeAgent
25
25
 
@@ -27,7 +27,7 @@ taskset = await load_taskset("WebVoyager") # or GAIA, OSWorld-Ubuntu, Mind2Web
27
27
  job = await run_job(ClaudeAgent, taskset, "my-evaluation")
28
28
  ```
29
29
 
30
- **Create Custom Tasks**
30
+ **[Create Custom Tasks](https://docs.hud.so/task-creation)**
31
31
  ```python
32
32
  from hud.task import Task
33
33
 
@@ -39,7 +39,7 @@ task = Task(
39
39
  )
40
40
  ```
41
41
 
42
- **Build Custom Environments**
42
+ **[Build Custom Environments](https://docs.hud.so/environment-creation)**
43
43
  ```python
44
44
  from hud.types import CustomGym
45
45
 
@@ -52,7 +52,7 @@ custom_gym = CustomGym(
52
52
  # Or create complex Docker environments - see environments/ folder for examples
53
53
  ```
54
54
 
55
- **Trace Tool Calls Alongside HUD Environments (or Independently)**
55
+ **[Trace Tool Calls Alongside HUD Environments (or Independently)](https://docs.hud.so/examples/mcp-agent-tracing)**
56
56
  ```python
57
57
  import hud
58
58
 
@@ -100,6 +100,7 @@ async def main():
100
100
  setup=("goto", "google.com"),
101
101
  evaluate=("contains_text", "capybara")
102
102
  )
103
+ print(f"Running task with prompt: {task.prompt}")
103
104
 
104
105
  # Create environment using the gym module
105
106
  env = await gym.make(task)
@@ -111,6 +112,7 @@ async def main():
111
112
  obs, _ = await env.reset() # Gets first observation
112
113
  for i in range(5):
113
114
  actions, done = await agent.predict(obs)
115
+ print(f"Agent action {i}: {actions}")
114
116
 
115
117
  obs, reward, terminated, info = await env.step(actions)
116
118
  if done or terminated: break
@@ -24,6 +24,7 @@
24
24
  "pages": [
25
25
  "examples/benchmarking-agents",
26
26
  "examples/alignment-evaluation",
27
+ "examples/web-mocks",
27
28
  "examples/custom-os-env",
28
29
  "examples/mcp-agent-tracing",
29
30
  "examples/web-app-testing"
@@ -329,7 +329,7 @@ We strongly encourage community contributions! If you've built a useful custom e
329
329
  Check the `environments/` directory in the SDK for inspiration:
330
330
  - `environments/novnc_ubuntu/`: Provides an Ubuntu desktop accessible via VNC, for GUI-based tasks.
331
331
  - `environments/custom_website/`: A template for packaging and testing your own web application.
332
- - `environments/gameboy/`: Example of a retro gaming environment.
332
+ - `environments/pokemon_controller/`: Example of a retro gaming environment.
333
333
 
334
334
  ## Using Remote Custom Environments
335
335
 
@@ -375,4 +375,4 @@ task_on_remote2 = Task(
375
375
 
376
376
  - **[Task Creation](/task-creation)**: How to define tasks that use your custom environments.
377
377
  - **[Custom Environments Overview](/environments/custom)**: Higher-level concepts of custom environments.
378
- - **[Browser Environment](/environments/browser)**: For standard web interaction tasks.
378
+ - **[Browser Environment](/environments/browser)**: For standard web interaction tasks.
@@ -0,0 +1,240 @@
1
+ ---
2
+ title: 'Web Mocks'
3
+ description: 'Clone websites and host them as stable test environments for AI agents using HUD page archives.'
4
+ icon: 'clone'
5
+ ---
6
+
7
+ # Page Cloning
8
+
9
+ This guide demonstrates how to create and host web archives for testing AI agents with consistent, offline-first environments. By cloning websites into WACZ (Web Archive Collection Zipped) files, you can ensure your agents always test against specific, unchanging versions of web pages.
10
+
11
+ **Goal**: Create reproducible web environments for testing browser-based agents without depending on live websites that might change or go offline.
12
+
13
+ **Concepts Covered**:
14
+ - Using ArchiveWeb.page to clone websites into WACZ files
15
+ - Hosting archives locally with the HUD page archives repository and `CustomGym`
16
+ - Uploading archives to app.hud.so for immediate cloud hosting
17
+ - Creating tasks that use these stable archived environments
18
+
19
+ ## Prerequisites
20
+
21
+ - HUD SDK installed
22
+ - Docker installed (for local hosting option)
23
+ - ArchiveWeb.page browser extension (for cloning pages)
24
+ - API keys for HUD and your chosen agent
25
+
26
+ ## Part 1: Cloning the Page
27
+
28
+ ### Installing ArchiveWeb.page
29
+
30
+ 1. **Install the Browser Extension**:
31
+ - Visit [ArchiveWeb.page](https://archiveweb.page)
32
+ - Install the extension for Chrome/Chromium-based browsers
33
+ - The extension icon will appear in your browser toolbar
34
+
35
+ 2. **Create a New Archive**:
36
+ - Click the ArchiveWeb.page extension icon
37
+ - Click "Create New Collection"
38
+ - Give your collection a descriptive name (e.g., "my-test-site")
39
+
40
+ ### Capturing Web Pages
41
+
42
+ 1. **Start Archiving**:
43
+ - Click "Start" in the extension popup to begin an archiving session
44
+ - Navigate to the website you want to clone
45
+ - Interact with the site as your agent would (login, navigate through pages, fill forms)
46
+ - All pages and resources will be captured automatically
47
+
48
+ 2. **Best Practices for Agent Testing**:
49
+ - Capture all relevant pages and states your agent will interact with
50
+ - Include error pages and edge cases
51
+ - If testing login flows, capture both logged-out and logged-in states
52
+ - For form submissions, capture the form page and success/error pages
53
+
54
+ 3. **Stop and Download**:
55
+ - Click "Stop" in the extension when done capturing
56
+ - Click "Download" to save your collection
57
+ - Choose WACZ format (default)
58
+ - Save with a meaningful filename (e.g., `my-test-site.wacz`)
59
+
60
+ ### Example: Cloning a Login Flow
61
+
62
+ ```
63
+ 1. Start archiving session
64
+ 2. Visit https://example.com/login
65
+ 3. Enter test credentials (e.g., testuser/password123)
66
+ 4. Submit the form
67
+ 5. Capture the dashboard/welcome page
68
+ 6. Optionally capture logout flow
69
+ 7. Stop and download as my-test-site.wacz
70
+ ```
71
+
72
+ ## Part 2: Hosting the Website
73
+
74
+ You have two options for hosting your archived website:
75
+
76
+ ### Option 1: Local Hosting with CustomGym
77
+
78
+ This approach uses the [HUD page archives repository](https://github.com/hud-evals/page-archives) to host archives locally and access them via `CustomGym`.
79
+
80
+ #### Step 1: Clone the Page Archives Repository
81
+
82
+ ```bash
83
+ git clone https://github.com/hud-evals/page-archives.git
84
+ cd page-archives
85
+ ```
86
+
87
+ #### Step 2: Add Your Archive
88
+
89
+ 1. **Place your WACZ file**:
90
+ ```bash
91
+ cp ~/Downloads/my-test-site.wacz archives/
92
+ ```
93
+
94
+ 2. **Update `archives/archive_list.json`**:
95
+ ```json
96
+ {
97
+ "archives": [
98
+ {
99
+ "name": "my-test-site",
100
+ "displayName": "My Test Site Archive",
101
+ "startPage": "https://example.com/login" // Optional: default page to open
102
+ }
103
+ // ... other archives
104
+ ]
105
+ }
106
+ ```
107
+
108
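+ Note: The `name` field must match your WACZ filename without the `.wacz` extension. The `//` comments in the example above are for illustration only; standard JSON does not permit comments, so omit them from the actual `archive_list.json` file.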
109
+
110
+ #### Step 3: Create a CustomGym for the Archive Server
111
+
112
+ ```python
113
+ from hud.types import CustomGym
114
+ from pathlib import Path
115
+
116
+ # Create a Dockerfile for the archive server
117
+ archive_server_dockerfile = """
118
+ FROM node:18-slim
119
+ WORKDIR /app
120
+ COPY . /app
121
+ RUN npm install
122
+ EXPOSE 3000
123
+ CMD ["npm", "run", "start"]
124
+ """
125
+
126
+ # Save Dockerfile in the page-archives directory
127
+ with open("page-archives/Dockerfile", "w") as f:
128
+ f.write(archive_server_dockerfile)
129
+
130
+ # Define the CustomGym
131
+ archive_server_gym = CustomGym(
132
+ location="local",
133
+ image_or_build_context=Path("./page-archives"),
134
+ host_config={
135
+ "port_bindings": {3000: 3000} # Expose port 3000
136
+ }
137
+ )
138
+ ```
139
+
140
+ #### Step 4: Create Tasks Using the Archived Site
141
+
142
+ ```python
143
+ from hud import Task, run_job
144
+ from hud.agent import ClaudeAgent
145
+
146
+ # Task to test login flow on the archived site
147
+ login_task = Task(
148
+ prompt="Log into the website using username 'testuser' and password 'password123'.",
149
+ gym="hud-browser", # Use browser to interact
150
+ setup=[
151
+ # Navigate to your archived site running locally
152
+ ("goto", "http://localhost:3000/my-test-site")
153
+ ],
154
+ evaluate=("page_contains", "Welcome, testuser!")
155
+ )
156
+ ```
157
+
158
+ #### Advanced: Query Parameters
159
+
160
+ The archive viewer supports useful query parameters:
161
+
162
+ ```python
163
+ # Open a specific page within the archive
164
+ specific_page_task = Task(
165
+ prompt="Navigate to the user profile page",
166
+ gym="hud-browser",
167
+ setup=[
168
+ ("goto", "http://localhost:3000/my-test-site?page=https%3A%2F%2Fexample.com%2Fprofile")
169
+ ]
170
+ )
171
+
172
+ # Debug mode - shows full ReplayWeb.page UI
173
+ debug_task = Task(
174
+ prompt="Explore the archive interface",
175
+ gym="hud-browser",
176
+ setup=[
177
+ ("goto", "http://localhost:3000/my-test-site?debug=true")
178
+ ]
179
+ )
180
+ ```
181
+
182
+ ### Option 2: Cloud Hosting on app.hud.so
183
+
184
+ For immediate hosting without local setup, use the HUD platform's built-in page cloning feature.
185
+
186
+ #### Step 1: Access Page Clone Feature
187
+
188
+ 1. Go to [app.hud.so](https://app.hud.so)
189
+ 2. Click "Create" in the navigation
190
+ 3. Select "Page Clone"
191
+
192
+ #### Step 2: Upload Your Archive
193
+
194
+ 1. Click "Upload WACZ file"
195
+ 2. Select your `.wacz` file created in Part 1
196
+ 3. Provide a name for your cloned environment
197
+ 4. Click "Create"
198
+
199
+ #### Step 3: Use the Hosted Archive
200
+
201
+ Once uploaded, you'll receive a URL for your hosted archive (e.g., `https://archives.hud.so/your-archive-id`).
202
+
203
+ ```python
204
+ from hud import Task, run_job
205
+ from hud.agent import ClaudeAgent
206
+
207
+ # Task using the cloud-hosted archive
208
+ cloud_login_task = Task(
209
+ prompt="Log into the website using username 'testuser' and password 'password123'.",
210
+ gym="hud-browser",
211
+ setup=[
212
+ # Navigate to your cloud-hosted archive
213
+ ("goto", "https://archives.hud.so/your-archive-id")
214
+ ],
215
+ evaluate=("page_contains", "Welcome, testuser!")
216
+ )
217
+
218
+ # Run evaluation
219
+ job = await run_job(
220
+ agent_cls=ClaudeAgent,
221
+ task_or_taskset=cloud_login_task,
222
+ job_name="Cloud Archive Test"
223
+ )
224
+ ```
225
+
226
+ ## Tips for Effective Page Cloning
227
+
228
+ 1. **Capture Complete Flows**: Don't just capture individual pages - capture entire user journeys
229
+ 2. **Include Resources**: Ensure CSS, JavaScript, and images are properly captured
230
+ 3. **Test Your Archives**: Always verify your archives work correctly before using them in evaluations
231
+ 4. **Document States**: Keep notes on what states and pages are included in each archive
232
+ 5. **Update Regularly**: Re-clone sites when significant changes occur
233
+
234
+ ## Key Takeaways
235
+
236
+ - ArchiveWeb.page makes it easy to create WACZ archives of any website
237
+ - Local hosting with CustomGym gives you full control and fast performance
238
+ - Cloud hosting on app.hud.so provides instant deployment without infrastructure
239
+ - Page cloning ensures consistent, reproducible testing environments for AI agents
240
+ - Archived sites eliminate external dependencies and enable offline testing
@@ -30,6 +30,10 @@ task = Task(
30
30
  setup=("goto", "https://news.example.com"), # Function to run at env.reset()
31
31
  evaluate=("page_contains", "artificial intelligence") # Function to run at env.evaluate()
32
32
  )
33
+
34
+ # Create environment
35
+ env = await gym.make(task)
36
+ # ...
33
37
  ```
34
38
 
35
39
  ## Setup Functions (for `hud-browser`)