PyPI - hud-python - Versions diffs - 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl - Mend

hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (282) hide show

hud/__init__.py +27 -7
hud/agents/__init__.py +70 -5
hud/agents/base.py +238 -500
hud/agents/claude.py +236 -247
hud/agents/gateway.py +42 -0
hud/agents/gemini.py +264 -0
hud/agents/gemini_cua.py +324 -0
hud/agents/grounded_openai.py +98 -100
hud/agents/misc/integration_test_agent.py +51 -20
hud/agents/misc/response_agent.py +48 -36
hud/agents/openai.py +282 -296
hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
hud/agents/operator.py +199 -0
hud/agents/resolver.py +70 -0
hud/agents/tests/conftest.py +133 -0
hud/agents/tests/test_base.py +300 -622
hud/agents/tests/test_base_runtime.py +233 -0
hud/agents/tests/test_claude.py +381 -214
hud/agents/tests/test_client.py +9 -10
hud/agents/tests/test_gemini.py +369 -0
hud/agents/tests/test_grounded_openai_agent.py +65 -50
hud/agents/tests/test_openai.py +377 -140
hud/agents/tests/test_operator.py +362 -0
hud/agents/tests/test_resolver.py +192 -0
hud/agents/tests/test_run_eval.py +179 -0
hud/agents/types.py +148 -0
hud/cli/__init__.py +493 -546
hud/cli/analyze.py +43 -5
hud/cli/build.py +699 -113
hud/cli/debug.py +8 -5
hud/cli/dev.py +889 -732
hud/cli/eval.py +793 -667
hud/cli/flows/dev.py +167 -0
hud/cli/flows/init.py +191 -0
hud/cli/flows/tasks.py +153 -56
hud/cli/flows/templates.py +151 -0
hud/cli/flows/tests/__init__.py +1 -0
hud/cli/flows/tests/test_dev.py +126 -0
hud/cli/init.py +60 -58
hud/cli/pull.py +1 -1
hud/cli/push.py +38 -13
hud/cli/rft.py +311 -0
hud/cli/rft_status.py +145 -0
hud/cli/tests/test_analyze.py +5 -5
hud/cli/tests/test_analyze_metadata.py +3 -2
hud/cli/tests/test_analyze_module.py +120 -0
hud/cli/tests/test_build.py +110 -8
hud/cli/tests/test_build_failure.py +41 -0
hud/cli/tests/test_build_module.py +50 -0
hud/cli/tests/test_cli_init.py +6 -1
hud/cli/tests/test_cli_more_wrappers.py +30 -0
hud/cli/tests/test_cli_root.py +140 -0
hud/cli/tests/test_convert.py +361 -0
hud/cli/tests/test_debug.py +12 -10
hud/cli/tests/test_dev.py +197 -0
hud/cli/tests/test_eval.py +251 -0
hud/cli/tests/test_eval_bedrock.py +51 -0
hud/cli/tests/test_init.py +124 -0
hud/cli/tests/test_main_module.py +11 -5
hud/cli/tests/test_mcp_server.py +12 -100
hud/cli/tests/test_push.py +1 -1
hud/cli/tests/test_push_happy.py +74 -0
hud/cli/tests/test_push_wrapper.py +23 -0
hud/cli/tests/test_registry.py +1 -1
hud/cli/tests/test_utils.py +1 -1
hud/cli/{rl → utils}/celebrate.py +14 -12
hud/cli/utils/config.py +18 -1
hud/cli/utils/docker.py +130 -4
hud/cli/utils/env_check.py +9 -9
hud/cli/utils/git.py +136 -0
hud/cli/utils/interactive.py +39 -5
hud/cli/utils/metadata.py +70 -1
hud/cli/utils/runner.py +1 -1
hud/cli/utils/server.py +2 -2
hud/cli/utils/source_hash.py +3 -3
hud/cli/utils/tasks.py +4 -1
hud/cli/utils/tests/__init__.py +0 -0
hud/cli/utils/tests/test_config.py +58 -0
hud/cli/utils/tests/test_docker.py +93 -0
hud/cli/utils/tests/test_docker_hints.py +71 -0
hud/cli/utils/tests/test_env_check.py +74 -0
hud/cli/utils/tests/test_environment.py +42 -0
hud/cli/utils/tests/test_git.py +142 -0
hud/cli/utils/tests/test_interactive_module.py +60 -0
hud/cli/utils/tests/test_local_runner.py +50 -0
hud/cli/utils/tests/test_logging_utils.py +23 -0
hud/cli/utils/tests/test_metadata.py +49 -0
hud/cli/utils/tests/test_package_runner.py +35 -0
hud/cli/utils/tests/test_registry_utils.py +49 -0
hud/cli/utils/tests/test_remote_runner.py +25 -0
hud/cli/utils/tests/test_runner_modules.py +52 -0
hud/cli/utils/tests/test_source_hash.py +36 -0
hud/cli/utils/tests/test_tasks.py +80 -0
hud/cli/utils/version_check.py +258 -0
hud/cli/{rl → utils}/viewer.py +2 -2
hud/clients/README.md +12 -11
hud/clients/__init__.py +4 -3
hud/clients/base.py +166 -26
hud/clients/environment.py +51 -0
hud/clients/fastmcp.py +13 -6
hud/clients/mcp_use.py +45 -15
hud/clients/tests/test_analyze_scenarios.py +206 -0
hud/clients/tests/test_protocol.py +9 -3
hud/datasets/__init__.py +23 -20
hud/datasets/loader.py +326 -0
hud/datasets/runner.py +198 -105
hud/datasets/tests/__init__.py +0 -0
hud/datasets/tests/test_loader.py +221 -0
hud/datasets/tests/test_utils.py +315 -0
hud/datasets/utils.py +270 -90
hud/environment/__init__.py +52 -0
hud/environment/connection.py +258 -0
hud/environment/connectors/__init__.py +33 -0
hud/environment/connectors/base.py +68 -0
hud/environment/connectors/local.py +177 -0
hud/environment/connectors/mcp_config.py +137 -0
hud/environment/connectors/openai.py +101 -0
hud/environment/connectors/remote.py +172 -0
hud/environment/environment.py +835 -0
hud/environment/integrations/__init__.py +45 -0
hud/environment/integrations/adk.py +67 -0
hud/environment/integrations/anthropic.py +196 -0
hud/environment/integrations/gemini.py +92 -0
hud/environment/integrations/langchain.py +82 -0
hud/environment/integrations/llamaindex.py +68 -0
hud/environment/integrations/openai.py +238 -0
hud/environment/mock.py +306 -0
hud/environment/router.py +263 -0
hud/environment/scenarios.py +620 -0
hud/environment/tests/__init__.py +1 -0
hud/environment/tests/test_connection.py +317 -0
hud/environment/tests/test_connectors.py +205 -0
hud/environment/tests/test_environment.py +593 -0
hud/environment/tests/test_integrations.py +257 -0
hud/environment/tests/test_local_connectors.py +242 -0
hud/environment/tests/test_scenarios.py +1086 -0
hud/environment/tests/test_tools.py +208 -0
hud/environment/types.py +23 -0
hud/environment/utils/__init__.py +35 -0
hud/environment/utils/formats.py +215 -0
hud/environment/utils/schema.py +171 -0
hud/environment/utils/tool_wrappers.py +113 -0
hud/eval/__init__.py +67 -0
hud/eval/context.py +727 -0
hud/eval/display.py +299 -0
hud/eval/instrument.py +187 -0
hud/eval/manager.py +533 -0
hud/eval/parallel.py +268 -0
hud/eval/task.py +372 -0
hud/eval/tests/__init__.py +1 -0
hud/eval/tests/test_context.py +178 -0
hud/eval/tests/test_eval.py +210 -0
hud/eval/tests/test_manager.py +152 -0
hud/eval/tests/test_parallel.py +168 -0
hud/eval/tests/test_task.py +291 -0
hud/eval/types.py +65 -0
hud/eval/utils.py +194 -0
hud/patches/__init__.py +19 -0
hud/patches/mcp_patches.py +308 -0
hud/patches/warnings.py +54 -0
hud/samples/browser.py +4 -4
hud/server/__init__.py +2 -1
hud/server/low_level.py +2 -1
hud/server/router.py +164 -0
hud/server/server.py +567 -80
hud/server/tests/test_mcp_server_integration.py +11 -11
hud/server/tests/test_mcp_server_more.py +1 -1
hud/server/tests/test_server_extra.py +2 -0
hud/settings.py +45 -3
hud/shared/exceptions.py +36 -10
hud/shared/hints.py +26 -1
hud/shared/requests.py +15 -3
hud/shared/tests/test_exceptions.py +40 -31
hud/shared/tests/test_hints.py +167 -0
hud/telemetry/__init__.py +20 -19
hud/telemetry/exporter.py +201 -0
hud/telemetry/instrument.py +165 -253
hud/telemetry/tests/test_eval_telemetry.py +356 -0
hud/telemetry/tests/test_exporter.py +258 -0
hud/telemetry/tests/test_instrument.py +401 -0
hud/tools/__init__.py +18 -2
hud/tools/agent.py +223 -0
hud/tools/apply_patch.py +639 -0
hud/tools/base.py +54 -4
hud/tools/bash.py +2 -2
hud/tools/computer/__init__.py +36 -3
hud/tools/computer/anthropic.py +2 -2
hud/tools/computer/gemini.py +385 -0
hud/tools/computer/hud.py +23 -6
hud/tools/computer/openai.py +20 -21
hud/tools/computer/qwen.py +434 -0
hud/tools/computer/settings.py +37 -0
hud/tools/edit.py +3 -7
hud/tools/executors/base.py +4 -2
hud/tools/executors/pyautogui.py +1 -1
hud/tools/grounding/grounded_tool.py +13 -18
hud/tools/grounding/grounder.py +10 -31
hud/tools/grounding/tests/test_grounded_tool.py +26 -44
hud/tools/jupyter.py +330 -0
hud/tools/playwright.py +18 -3
hud/tools/shell.py +308 -0
hud/tools/tests/test_agent_tool.py +355 -0
hud/tools/tests/test_apply_patch.py +718 -0
hud/tools/tests/test_computer.py +4 -9
hud/tools/tests/test_computer_actions.py +24 -2
hud/tools/tests/test_jupyter_tool.py +181 -0
hud/tools/tests/test_shell.py +596 -0
hud/tools/tests/test_submit.py +85 -0
hud/tools/tests/test_types.py +193 -0
hud/tools/types.py +21 -1
hud/types.py +194 -56
hud/utils/__init__.py +2 -0
hud/utils/env.py +67 -0
hud/utils/hud_console.py +89 -18
hud/utils/mcp.py +15 -58
hud/utils/strict_schema.py +162 -0
hud/utils/tests/test_init.py +1 -2
hud/utils/tests/test_mcp.py +1 -28
hud/utils/tests/test_pretty_errors.py +186 -0
hud/utils/tests/test_tool_shorthand.py +154 -0
hud/utils/tests/test_version.py +1 -1
hud/utils/types.py +20 -0
hud/version.py +1 -1
hud_python-0.5.13.dist-info/METADATA +264 -0
hud_python-0.5.13.dist-info/RECORD +305 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
hud/agents/langchain.py +0 -261
hud/agents/lite_llm.py +0 -72
hud/cli/rl/__init__.py +0 -180
hud/cli/rl/config.py +0 -101
hud/cli/rl/display.py +0 -133
hud/cli/rl/gpu.py +0 -63
hud/cli/rl/gpu_utils.py +0 -321
hud/cli/rl/local_runner.py +0 -595
hud/cli/rl/presets.py +0 -96
hud/cli/rl/remote_runner.py +0 -463
hud/cli/rl/rl_api.py +0 -150
hud/cli/rl/vllm.py +0 -177
hud/cli/rl/wait_utils.py +0 -89
hud/datasets/parallel.py +0 -687
hud/misc/__init__.py +0 -1
hud/misc/claude_plays_pokemon.py +0 -292
hud/otel/__init__.py +0 -35
hud/otel/collector.py +0 -142
hud/otel/config.py +0 -181
hud/otel/context.py +0 -570
hud/otel/exporters.py +0 -369
hud/otel/instrumentation.py +0 -135
hud/otel/processors.py +0 -121
hud/otel/tests/__init__.py +0 -1
hud/otel/tests/test_processors.py +0 -197
hud/rl/README.md +0 -30
hud/rl/__init__.py +0 -1
hud/rl/actor.py +0 -176
hud/rl/buffer.py +0 -405
hud/rl/chat_template.jinja +0 -101
hud/rl/config.py +0 -192
hud/rl/distributed.py +0 -132
hud/rl/learner.py +0 -637
hud/rl/tests/__init__.py +0 -1
hud/rl/tests/test_learner.py +0 -186
hud/rl/train.py +0 -382
hud/rl/types.py +0 -101
hud/rl/utils/start_vllm_server.sh +0 -30
hud/rl/utils.py +0 -524
hud/rl/vllm_adapter.py +0 -143
hud/telemetry/job.py +0 -352
hud/telemetry/replay.py +0 -74
hud/telemetry/tests/test_replay.py +0 -40
hud/telemetry/tests/test_trace.py +0 -63
hud/telemetry/trace.py +0 -158
hud/utils/agent_factories.py +0 -86
hud/utils/async_utils.py +0 -65
hud/utils/group_eval.py +0 -223
hud/utils/progress.py +0 -149
hud/utils/tasks.py +0 -127
hud/utils/tests/test_async_utils.py +0 -173
hud/utils/tests/test_progress.py +0 -261
hud_python-0.4.45.dist-info/METADATA +0 -552
hud_python-0.4.45.dist-info/RECORD +0 -228
{hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0

hud_python-0.4.45.dist-info/METADATA DELETED Viewed

@@ -1,552 +0,0 @@
-Metadata-Version: 2.4
-Name: hud-python
-Version: 0.4.45
-Summary: SDK for the HUD platform.
-Project-URL: Homepage, https://github.com/hud-evals/hud-python
-Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
-Project-URL: Documentation, https://docs.hud.so
-Author-email: HUD SDK <founders@hud.so>
-License: MIT License
-        Copyright (c) 2025 Human Union Data, Inc
-        Permission is hereby granted, free of charge, to any person obtaining a copy
-        of this software and associated documentation files (the "Software"), to deal
-        in the Software without restriction, including without limitation the rights
-        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-        copies of the Software, and to permit persons to whom the Software is
-        furnished to do so, subject to the following conditions:
-        The above copyright notice and this permission notice shall be included in all
-        copies or substantial portions of the Software.
-        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-        SOFTWARE.
-License-File: LICENSE
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Requires-Python: <3.13,>=3.11
-Requires-Dist: anthropic
-Requires-Dist: blessed>=1.20.0
-Requires-Dist: datasets>=2.14.0
-Requires-Dist: httpx<1,>=0.23.0
-Requires-Dist: hud-fastmcp-python-sdk>=0.1.2
-Requires-Dist: hud-mcp-python-sdk>=3.13.2
-Requires-Dist: hud-mcp-use-python-sdk==2.3.20
-Requires-Dist: numpy>=1.24.0
-Requires-Dist: openai
-Requires-Dist: opentelemetry-api>=1.34.1
-Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.34.1
-Requires-Dist: opentelemetry-instrumentation-mcp==0.47.0
-Requires-Dist: opentelemetry-sdk>=1.34.1
-Requires-Dist: pathspec>=0.12.1
-Requires-Dist: pillow>=11.1.0
-Requires-Dist: prompt-toolkit==3.0.51
-Requires-Dist: pydantic-settings<3,>=2.2
-Requires-Dist: pydantic<3,>=2.6
-Requires-Dist: questionary==2.1.0
-Requires-Dist: rich>=13.0.0
-Requires-Dist: toml>=0.10.2
-Requires-Dist: typer>=0.9.0
-Requires-Dist: watchfiles>=0.21.0
-Requires-Dist: wrapt>=1.14.0
-Provides-Extra: agent
-Requires-Dist: aiodocker>=0.24.0; extra == 'agent'
-Requires-Dist: dotenv>=0.9.9; extra == 'agent'
-Requires-Dist: inspect-ai>=0.3.80; extra == 'agent'
-Requires-Dist: ipykernel; extra == 'agent'
-Requires-Dist: ipython<9; extra == 'agent'
-Requires-Dist: jupyter-client; extra == 'agent'
-Requires-Dist: jupyter-core; extra == 'agent'
-Requires-Dist: langchain; extra == 'agent'
-Requires-Dist: langchain-anthropic; extra == 'agent'
-Requires-Dist: langchain-openai; extra == 'agent'
-Requires-Dist: litellm>=1.55.0; extra == 'agent'
-Requires-Dist: pillow>=11.1.0; extra == 'agent'
-Requires-Dist: playwright; extra == 'agent'
-Requires-Dist: pyautogui>=0.9.54; extra == 'agent'
-Requires-Dist: pyright==1.1.401; extra == 'agent'
-Requires-Dist: pytest-asyncio; extra == 'agent'
-Requires-Dist: pytest-cov; extra == 'agent'
-Requires-Dist: pytest-mock; extra == 'agent'
-Requires-Dist: pytest<9,>=8.1.1; extra == 'agent'
-Requires-Dist: ruff>=0.11.8; extra == 'agent'
-Requires-Dist: setuptools; extra == 'agent'
-Requires-Dist: textdistance<5,>=4.5.0; extra == 'agent'
-Provides-Extra: agents
-Requires-Dist: aiodocker>=0.24.0; extra == 'agents'
-Requires-Dist: dotenv>=0.9.9; extra == 'agents'
-Requires-Dist: inspect-ai>=0.3.80; extra == 'agents'
-Requires-Dist: ipykernel; extra == 'agents'
-Requires-Dist: ipython<9; extra == 'agents'
-Requires-Dist: jupyter-client; extra == 'agents'
-Requires-Dist: jupyter-core; extra == 'agents'
-Requires-Dist: langchain; extra == 'agents'
-Requires-Dist: langchain-anthropic; extra == 'agents'
-Requires-Dist: langchain-openai; extra == 'agents'
-Requires-Dist: litellm>=1.55.0; extra == 'agents'
-Requires-Dist: pillow>=11.1.0; extra == 'agents'
-Requires-Dist: playwright; extra == 'agents'
-Requires-Dist: pyautogui>=0.9.54; extra == 'agents'
-Requires-Dist: pyright==1.1.401; extra == 'agents'
-Requires-Dist: pytest-asyncio; extra == 'agents'
-Requires-Dist: pytest-cov; extra == 'agents'
-Requires-Dist: pytest-mock; extra == 'agents'
-Requires-Dist: pytest<9,>=8.1.1; extra == 'agents'
-Requires-Dist: ruff>=0.11.8; extra == 'agents'
-Requires-Dist: setuptools; extra == 'agents'
-Requires-Dist: textdistance<5,>=4.5.0; extra == 'agents'
-Provides-Extra: dev
-Requires-Dist: aiodocker>=0.24.0; extra == 'dev'
-Requires-Dist: dotenv>=0.9.9; extra == 'dev'
-Requires-Dist: inspect-ai>=0.3.80; extra == 'dev'
-Requires-Dist: ipykernel; extra == 'dev'
-Requires-Dist: ipython<9; extra == 'dev'
-Requires-Dist: jupyter-client; extra == 'dev'
-Requires-Dist: jupyter-core; extra == 'dev'
-Requires-Dist: langchain; extra == 'dev'
-Requires-Dist: langchain-anthropic; extra == 'dev'
-Requires-Dist: langchain-openai; extra == 'dev'
-Requires-Dist: litellm>=1.55.0; extra == 'dev'
-Requires-Dist: pillow>=11.1.0; extra == 'dev'
-Requires-Dist: playwright; extra == 'dev'
-Requires-Dist: pyautogui>=0.9.54; extra == 'dev'
-Requires-Dist: pyright==1.1.401; extra == 'dev'
-Requires-Dist: pytest-asyncio; extra == 'dev'
-Requires-Dist: pytest-cov; extra == 'dev'
-Requires-Dist: pytest-mock; extra == 'dev'
-Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
-Requires-Dist: ruff>=0.11.8; extra == 'dev'
-Requires-Dist: setuptools; extra == 'dev'
-Requires-Dist: textdistance<5,>=4.5.0; extra == 'dev'
-Provides-Extra: rl
-Requires-Dist: bitsandbytes>=0.41.0; (sys_platform == 'linux') and extra == 'rl'
-Requires-Dist: liger-kernel>=0.5.0; (sys_platform == 'linux') and extra == 'rl'
-Requires-Dist: peft>=0.17.1; extra == 'rl'
-Requires-Dist: vllm==0.10.1.1; extra == 'rl'
-Description-Content-Type: text/markdown
-<div align="left">
-  <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/logo/hud_logo_dark.svg">
-    <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/logo/hud_logo.svg">
-    <img src="https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/logo/hud_logo.svg" alt="HUD" width="150" style="margin-bottom: 24px;"/>
-  </picture>
-</div>
-OSS RL environment + evals toolkit. Wrap software as environments, run benchmarks, and train with RL – locally or at scale.
-[![PyPI version](https://img.shields.io/pypi/v/hud-python?style=flat-square)](https://pypi.org/project/hud-python/)
-[![License](https://img.shields.io/badge/license-MIT-green?style=flat-square)](LICENSE)
-[![Add docs to Cursor](https://img.shields.io/badge/Add%20docs%20to-Cursor-black?style=flat-square)](https://cursor.com/en/install-mcp?name=docs-hud-python&config=eyJ1cmwiOiJodHRwczovL2RvY3MuaHVkLnNvL21jcCJ9)
-[![Discord](https://img.shields.io/discord/1327447144772407390?label=Discord&logo=discord&style=flat-square)](https://discord.gg/wkjtmHYYjm)
-[![X Follow](https://img.shields.io/twitter/follow/hud_evals?style=social)](https://x.com/intent/user?screen_name=hud_evals)
-[![Shop](https://img.shields.io/badge/_-white.svg?label=shop&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAJCAYAAAAywQxIAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAACxMAAAsTAQCanBgAAAF6SURBVChTlZA9ixNhFIWf8yaTpFHRRMXCKpAZhCAYFvwoLHZhwUKw9A9YCJb+Bq0sxGbBQrTxX1j41dvIRAjGZbdwRUUGIzPMeyw2swS3WZ/ynHvP5VylafoAWAd+5Xm+wX+SpukmcMf29RDCZrD9BViz3f53+CjYngKZpD5A2/Y7SQBMJpOkKIprdV1vdzqdHzHGblmW9Ww2+5pl2TmAxWKxmM/nP8fj8cmqqtZijJ9sb0u6ABBWjh0riuIt8CqE8LGu66e2d5MkeQ8QY3xme7fb7T4ZjUbrZVl+jjFuSXoEXGxCDgIl9WzfAO5LSmzvNB771R6vzG4Bx0MIt/M8vwV8aLyDQNt70+n0G1AspaTxVln+aghQluVsKbvxVysflT9NQK/XO7R/SGiQ9Nt2aftElmWXJd1kv0kbeANQVdWl4XB4XtJouXaqNRgMHkrqS+r0+/3XwD1JXdungRfAVWBi+6WkK8D3EMJz22cl3W21WgNgx3YAzvwFd0Chdq03gKUAAAAASUVORK5CYII=&style=social)](https://shop.hud.so)
-### Are you a startup building agents?
-[📅 Hop on a call](https://cal.com/jay-ram-z6st6w/demo) or [📧 founders@hud.so](mailto:founders@hud.so)
-## Highlights
-- 🎓 **[One-click RL](https://hud.so/models)** – Run `hud rl` to get a trained model on any environment.
-- 🚀 **[MCP environment skeleton](https://docs.hud.so/core-concepts/mcp-protocol)** – any agent can call any environment.
-- ⚡️ **[Live telemetry](https://hud.so)** – inspect every tool call, observation, and reward in real time.
-- 🗂️ **[Public benchmarks](https://hud.so/leaderboards)** – OSWorld-Verified, SheetBench-50, and more.
-- 🌐 **[Cloud browsers](environments/remote_browser/)** – AnchorBrowser, Steel, BrowserBase integrations for browser automation.
-- 🛠️ **[Hot-reload dev loop](environments/README.md#phase-5-hot-reload-development-with-cursor-agent)** – `hud dev` for iterating on environments without rebuilds.
-> We welcome contributors and feature requests – open an issue or hop on a call to discuss improvements!
-## Installation
-```bash
-# SDK - MCP servers, telemetry, evaluation
-pip install hud-python
-# CLI - RL pipeline, environment design
-uv tool install hud-python
-# uv tool update-shell
-```
-> See [docs.hud.so](https://docs.hud.so), or add docs to any MCP client:
-> `claude mcp add --transport http docs-hud https://docs.hud.so/mcp`
-Before starting, get your HUD_API_KEY at [hud.so](https://hud.so).
-## Quickstart: Training
-Train a Qwen2.5-VL model with RL (GRPO) on any hud dataset:
-```bash
-hud get hud-evals/basic-2048 # from HF
-hud rl basic-2048.json
-```
-> See [agent training docs](https://docs.hud.so/train-agents/quickstart)
-Or make your own environment and dataset:
-```bash
-hud init my-env && cd my-env
-hud dev --interactive
-# When ready to run:
-hud rl
-```
-> See [environment design docs](https://docs.hud.so/build-environments)
-## Quickstart: Evals
-For a tutorial that explains the agent and evaluation design, run:
-```bash
-uvx hud-python quickstart
-```
-Or just write your own agent loop (more [examples here](examples/)).
-```python
-import asyncio, hud, os
-from hud.settings import settings
-from hud.clients import MCPClient
-from hud.agents import ClaudeAgent
-from hud.datasets import Task  # See docs: https://docs.hud.so/reference/tasks
-async def main() -> None:
-    with hud.trace("Quick Start 2048"): # All telemetry works for any MCP-based agent (see https://hud.so)
-        task = {
-            "prompt": "Reach 64 in 2048.",
-            "mcp_config": {
-                "hud": {
-                    "url": "https://mcp.hud.so/v3/mcp",  # HUD's cloud MCP server (see https://docs.hud.so/core-concepts/architecture)
-                    "headers": {
-                        "Authorization": f"Bearer {settings.api_key}",  # Get your key at https://hud.so
-                        "Mcp-Image": "hudpython/hud-text-2048:v1.2"  # Docker image from https://hub.docker.com/u/hudpython
-                    }
-                }
-            },
-            "evaluate_tool": {"name": "evaluate", "arguments": {"name": "max_number", "arguments": {"target": 64}}},
-        }
-        task = Task(**task)
-        # 1. Define the client explicitly:
-        client = MCPClient(mcp_config=task.mcp_config)
-        agent = ClaudeAgent(
-            mcp_client=client,
-            model="claude-sonnet-4-20250514",  # requires ANTHROPIC_API_KEY
-        )
-        result = await agent.run(task)
-        # 2. Or just:
-        # result = await ClaudeAgent().run(task)
-        print(f"Reward: {result.reward}")
-        await client.shutdown()
-asyncio.run(main())
-```
-The above example lets the agent play 2048 ([See replay](https://hud.so/trace/6feed7bd-5f67-4d66-b77f-eb1e3164604f))
-![Agent playing 2048](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/2048_1.gif)
-## Reinforcement Learning with GRPO
-This is a Qwen‑2.5‑VL‑3B agent training a policy on the 2048-basic browser environment:
-![RL curve](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/rl_2.png)
-Train with the new interactive `hud rl` flow:
-```bash
-# Install CLI
-uv tool install hud-python
-# Option A: Run directly from a HuggingFace dataset
-hud rl hud-evals/basic-2048
-# Option B: Download first, modify, then train
-hud get hud-evals/basic-2048
-hud rl basic-2048.json
-# Optional: baseline evaluation
-hud eval basic-2048.json
-```
-Supports multi‑turn RL for both:
-- Language‑only models (e.g., `Qwen/Qwen2.5-7B-Instruct`)
-- Vision‑Language models (e.g., `Qwen/Qwen2.5-VL-3B-Instruct`)
-By default, `hud rl` provisions a persistent server and trainer in the cloud, streams telemetry to `hud.so`, and lets you monitor/manage models at `hud.so/models`. Use `--local` to run entirely on your machines (typically 2+ GPUs: one for vLLM, the rest for training).
-Any HUD MCP environment and evaluation works with our RL pipeline (including remote configurations). See the guided docs: `https://docs.hud.so/train-agents/quickstart`.
-Pricing: Hosted vLLM and training GPU rates are listed in the [Training Quickstart → Pricing](https://docs.hud.so/train-agents/quickstart#pricing). Manage billing at the [HUD billing dashboard](https://hud.so/project/billing).
-## Benchmarking Agents
-This is Claude Computer Use running on our proprietary financial analyst benchmark [SheetBench-50](https://huggingface.co/datasets/hud-evals/SheetBench-50):
-![Trace screenshot](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/trace_sheet.gif)
-> [See this trace on _hud.so_](https://hud.so/trace/9e212e9e-3627-4f1f-9eb5-c6d03c59070a)
-This example runs the full dataset (only takes ~20 minutes) using [run_evaluation.py](examples/run_evaluation.py):
-```bash
-python examples/run_evaluation.py hud-evals/SheetBench-50 --full --agent claude
-```
-Or in code:
-```python
-import asyncio
-from hud.datasets import run_dataset
-from hud.agents import ClaudeAgent
-results = await run_dataset(
-    name="My SheetBench-50 Evaluation",
-    dataset="hud-evals/SheetBench-50",      # <-- HuggingFace dataset
-    agent_class=ClaudeAgent,                # <-- Your custom agent can replace this (see https://docs.hud.so/evaluate-agents/create-agents)
-    agent_config={"model": "claude-sonnet-4-20250514"},
-    max_concurrent=50,
-    max_steps=30,
-)
-print(f"Average reward: {sum(r.reward for r in results) / len(results):.2f}")
-```
-> Running a dataset creates a job and streams results to the [hud.so](https://hud.so) platform for analysis and [leaderboard submission](https://docs.hud.so/evaluate-agents/leaderboards).
-## Building Environments (MCP)
-This is how you can make any environment into an interactable one in 5 steps:
-1. Define MCP server layer using [`MCPServer`](https://docs.hud.so/reference/environments)
-```python
-from hud.server import MCPServer
-from hud.tools import HudComputerTool
-mcp = MCPServer("My Environment")
-# Add hud tools (see all tools: https://docs.hud.so/reference/tools)
-mcp.tool(HudComputerTool())
-# Or custom tools (see https://docs.hud.so/build-environments/adapting-software)
-@mcp.tool("launch_app")
-def launch_app(name: str = "Gmail"):
-    ...
-if __name__ == "__main__":
-    mcp.run()
-```
-2. Write a simple Dockerfile that installs packages and runs:
-```dockerfile
-CMD ["python", "-m", "hud_controller.server"]
-```
-And build the image:
-```bash
-hud build # runs docker build under the hood
-```
-Or run it in interactive mode:
-```bash
-hud dev
-```
-3. Debug it with the CLI to see if it launches:
-```console
-$ hud debug my-name/my-environment:latest
-✓ Phase 1: Docker image exists
-✓ Phase 2: MCP server responds to initialize
-✓ Phase 3: Tools are discoverable
-✓ Phase 4: Basic tool execution works
-✓ Phase 5: Parallel performance is good
-Progress: [█████████████████████] 5/5 phases (100%)
-✅ All phases completed successfully!
-```
-Analyze it to see if all tools appear:
-```console
-$ hud analyze hudpython/hud-remote-browser:latest
-⠏ ✓ Analysis complete
-...
-Tools
-├── Regular Tools
-│   ├── computer
-│   │   └── Control computer with mouse, keyboard, and screenshots
-...
-└── Hub Tools
-    ├── setup
-    │   ├── navigate_to_url
-    │   ├── set_cookies
-    │   ├── ...
-    └── evaluate
-        ├── url_match
-        ├── page_contains
-        ├── cookie_exists
-        ├── ...
-📡 Telemetry Data
- Live URL  https://live.anchorbrowser.io?sessionId=abc123def456
-```
-4. When the tests pass, push it up to the docker registry:
-```bash
-hud push # needs docker login, hud api key
-```
-5. Now you can use `mcp.hud.so` to launch 100s of instances of this environment in parallel with any agent, and see everything live on [hud.so](https://hud.so):
-```python
-from hud.agents import ClaudeAgent
-result = await ClaudeAgent().run({  # See all agents: https://docs.hud.so/reference/agents
-    "prompt": "Please explore this environment",
-    "mcp_config": {
-        "my-environment": {
-            "url": "https://mcp.hud.so/v3/mcp",
-            "headers": {
-                "Authorization": f"Bearer {os.getenv('HUD_API_KEY')}",
-                "Mcp-Image": "my-name/my-environment:latest"
-            }
-        }
-        # "my-environment": { # or use hud run which wraps local and remote running
-        #     "cmd": "hud",
-        #     "args": [
-        #         "run",
-        #         "my-name/my-environment:latest",
-        #     ]
-        # }
-    }
-})
-```
-> See the full environment design guide and common pitfalls in [`environments/README.md`](environments/README.md)
-## Leaderboards & benchmarks
-All leaderboards are publicly available on [hud.so/leaderboards](https://hud.so/leaderboards) (see [docs](https://docs.hud.so/evaluate-agents/leaderboards))
-![Leaderboard](https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/src/images/leaderboards_3.png)
-We highly suggest running 3-5 evaluations per dataset for the most consistent results across multiple jobs.
-Using the [`run_dataset`](https://docs.hud.so/reference/tasks#run_dataset) function with a HuggingFace dataset automatically assigns your job to that leaderboard page, and allows you to create a scorecard out of it:
-## Architecture
-```mermaid
-%%{init: {"theme": "neutral", "themeVariables": {"fontSize": "14px"}} }%%
-graph LR
-    subgraph "Platform"
-        Dashboard["📊 hud.so"]
-        API["🔌 mcp.hud.so"]
-    end
-    subgraph "hud"
-        Agent["🤖 Agent"]
-        Task["📋 Task"]
-        SDK["📦 SDK"]
-    end
-    subgraph "Environments"
-        LocalEnv["🖥️ Local Docker<br/>(Development)"]
-        RemoteEnv["☁️ Remote Docker<br/>(100s Parallel)"]
-    end
-    subgraph "otel"
-        Trace["📡 Traces & Metrics"]
-    end
-    Dataset["📚 Dataset<br/>(HuggingFace)"]
-    AnyMCP["🔗 Any MCP Client<br/>(Cursor, Claude, Custom)"]
-    Agent <--> SDK
-    Task --> SDK
-    Dataset <-.-> Task
-    SDK <-->|"MCP"| LocalEnv
-    SDK <-->|"MCP"| API
-    API  <-->|"MCP"| RemoteEnv
-    SDK  --> Trace
-    Trace --> Dashboard
-    AnyMCP -->|"MCP"| API
-```
-## CLI reference
-| Command                 | Purpose                                    | Docs |
-| ----------------------- | ------------------------------------------ | ---- |
-| [`hud init`](https://docs.hud.so/reference/cli/init)            | Create new environment with boilerplate.  | [📖](https://docs.hud.so/reference/cli/init) |
-| [`hud dev`](https://docs.hud.so/reference/cli/dev)              | Hot-reload development with Docker.        | [📖](https://docs.hud.so/reference/cli/dev) |
-| [`hud build`](https://docs.hud.so/reference/cli/build)          | Build image and generate lock file.       | [📖](https://docs.hud.so/reference/cli/build) |
-| [`hud push`](https://docs.hud.so/reference/cli/push)            | Share environment to registry.            | [📖](https://docs.hud.so/reference/cli/push) |
-| [`hud pull <target>`](https://docs.hud.so/reference/cli/pull)   | Get environment from registry.            | [📖](https://docs.hud.so/reference/cli/pull) |
-| [`hud analyze <image>`](https://docs.hud.so/reference/cli/analyze) | Discover tools, resources, and metadata.   | [📖](https://docs.hud.so/reference/cli/analyze) |
-| [`hud debug <image>`](https://docs.hud.so/reference/cli/debug)   | Five-phase health check of an environment. | [📖](https://docs.hud.so/reference/cli/debug) |
-| [`hud run <image>`](https://docs.hud.so/reference/cli/run)       | Run MCP server locally or remotely.       | [📖](https://docs.hud.so/reference/cli/run) |
-## Roadmap
-- Merging our forks into the main `mcp`, `mcp_use` repositories
-- Helpers for building new environments (see [current guide](environments/README.md))
-- Integrations with every major agent framework
-- Evaluation environment registry
-- MCP opentelemetry standard
-## Contributing
-We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
-Key areas:
-- [Environment examples](environments/) - Add new MCP environments
-- [Agent implementations](hud/agents/) - Add support for new LLM providers
-- [Tool library](hud/tools/) - Extend the built-in tool collection
-- [RL training](hud/rl/) - Improve reinforcement learning pipelines
-Thanks to all our contributors!
-<a href="https://github.com/hud-evals/hud-python/graphs/contributors">
-  <img src="https://contrib.rocks/image?repo=hud-evals/hud-python&max=50" />
-</a>
-## Citation
-```bibtex
-@software{hud2025agentevalplatform,
-  author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Oskars Putans and Govind Pimpale and Mayank Singamreddy and Nguyen Nhat Minh},
-  title  = {HUD: An Evaluation Platform for Agents},
-  date   = {2025-04},
-  url    = {https://github.com/hud-evals/hud-python},
-  langid = {en}
-}
-```
-> **License**: HUD is released under the MIT License – see the [LICENSE](LICENSE) file for details.

hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl