hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from hud.utils.tool_shorthand import (
|
|
4
|
+
_is_call_like,
|
|
5
|
+
_to_call_dict,
|
|
6
|
+
normalize_to_tool_call_dict,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_is_call_like_with_name_and_arguments():
|
|
11
|
+
"""Test _is_call_like with name and arguments keys."""
|
|
12
|
+
obj = {"name": "test_tool", "arguments": {"key": "value"}}
|
|
13
|
+
assert _is_call_like(obj) is True
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_is_call_like_with_single_key_dict_value():
|
|
17
|
+
"""Test _is_call_like with single key dict containing dict value."""
|
|
18
|
+
obj = {"tool": {"name": "test"}}
|
|
19
|
+
assert _is_call_like(obj) is True
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_is_call_like_with_nested_single_key():
|
|
23
|
+
"""Test _is_call_like with nested single key dict."""
|
|
24
|
+
obj = {"tool": {"inner": {"key": "value"}}}
|
|
25
|
+
assert _is_call_like(obj) is True
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_is_call_like_not_dict():
|
|
29
|
+
"""Test _is_call_like returns False for non-dict."""
|
|
30
|
+
assert _is_call_like("string") is False
|
|
31
|
+
assert _is_call_like(123) is False
|
|
32
|
+
assert _is_call_like(None) is False
|
|
33
|
+
assert _is_call_like([]) is False
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def test_is_call_like_empty_dict():
|
|
37
|
+
"""Test _is_call_like returns False for empty dict."""
|
|
38
|
+
assert _is_call_like({}) is False
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_is_call_like_multi_key_dict():
|
|
42
|
+
"""Test _is_call_like returns False for multi-key dict without name/arguments."""
|
|
43
|
+
obj = {"key1": "value1", "key2": "value2"}
|
|
44
|
+
assert _is_call_like(obj) is False
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_to_call_dict_with_name_arguments():
|
|
48
|
+
"""Test _to_call_dict preserves name and arguments."""
|
|
49
|
+
obj = {"name": "test_tool", "arguments": {"param": "value"}}
|
|
50
|
+
result = _to_call_dict(obj)
|
|
51
|
+
assert result == {"name": "test_tool", "arguments": {"param": "value"}}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_to_call_dict_with_nested_call():
|
|
55
|
+
"""Test _to_call_dict with nested call-like arguments."""
|
|
56
|
+
obj = {"name": "outer", "arguments": {"name": "inner", "arguments": {"x": 1}}}
|
|
57
|
+
result = _to_call_dict(obj)
|
|
58
|
+
assert result == {"name": "outer", "arguments": {"name": "inner", "arguments": {"x": 1}}}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def test_to_call_dict_shorthand_single_key():
|
|
62
|
+
"""Test _to_call_dict converts shorthand single-key dict."""
|
|
63
|
+
obj = {"tool_name": {"name": "inner", "arguments": {}}}
|
|
64
|
+
result = _to_call_dict(obj)
|
|
65
|
+
assert result == {"name": "tool_name", "arguments": {"name": "inner", "arguments": {}}}
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def test_to_call_dict_non_call_arguments():
|
|
69
|
+
"""Test _to_call_dict with non-call-like arguments."""
|
|
70
|
+
obj = {"name": "test", "arguments": {"simple": "value"}}
|
|
71
|
+
result = _to_call_dict(obj)
|
|
72
|
+
assert result == {"name": "test", "arguments": {"simple": "value"}}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def test_to_call_dict_non_dict():
|
|
76
|
+
"""Test _to_call_dict returns non-dict unchanged."""
|
|
77
|
+
assert _to_call_dict("string") == "string"
|
|
78
|
+
assert _to_call_dict(123) == 123
|
|
79
|
+
assert _to_call_dict(None) is None
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def test_to_call_dict_single_key_non_call():
|
|
83
|
+
"""Test _to_call_dict with single key but non-call value."""
|
|
84
|
+
obj = {"key": "simple_value"}
|
|
85
|
+
result = _to_call_dict(obj)
|
|
86
|
+
assert result == {"key": "simple_value"}
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def test_normalize_to_tool_call_dict_none():
|
|
90
|
+
"""Test normalize_to_tool_call_dict with None."""
|
|
91
|
+
assert normalize_to_tool_call_dict(None) is None
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def test_normalize_to_tool_call_dict_simple_dict():
|
|
95
|
+
"""Test normalize_to_tool_call_dict with simple dict."""
|
|
96
|
+
obj = {"name": "tool", "arguments": {"x": 1}}
|
|
97
|
+
result = normalize_to_tool_call_dict(obj)
|
|
98
|
+
assert result == {"name": "tool", "arguments": {"x": 1}}
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def test_normalize_to_tool_call_dict_shorthand():
|
|
102
|
+
"""Test normalize_to_tool_call_dict with shorthand notation."""
|
|
103
|
+
obj = {"tool_name": {"name": "inner", "arguments": {}}}
|
|
104
|
+
result = normalize_to_tool_call_dict(obj)
|
|
105
|
+
assert result == {"name": "tool_name", "arguments": {"name": "inner", "arguments": {}}}
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def test_normalize_to_tool_call_dict_list():
|
|
109
|
+
"""Test normalize_to_tool_call_dict with list of dicts."""
|
|
110
|
+
obj = [
|
|
111
|
+
{"name": "tool1", "arguments": {"a": 1}},
|
|
112
|
+
{"name": "tool2", "arguments": {"b": 2}},
|
|
113
|
+
]
|
|
114
|
+
result = normalize_to_tool_call_dict(obj)
|
|
115
|
+
assert len(result) == 2
|
|
116
|
+
assert result[0] == {"name": "tool1", "arguments": {"a": 1}}
|
|
117
|
+
assert result[1] == {"name": "tool2", "arguments": {"b": 2}}
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def test_normalize_to_tool_call_dict_list_shorthand():
|
|
121
|
+
"""Test normalize_to_tool_call_dict with list of shorthand dicts."""
|
|
122
|
+
obj = [
|
|
123
|
+
{"tool1": {"name": "inner1", "arguments": {}}},
|
|
124
|
+
{"tool2": {"name": "inner2", "arguments": {}}},
|
|
125
|
+
]
|
|
126
|
+
result = normalize_to_tool_call_dict(obj)
|
|
127
|
+
assert len(result) == 2
|
|
128
|
+
assert result[0]["name"] == "tool1"
|
|
129
|
+
assert result[1]["name"] == "tool2"
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def test_normalize_to_tool_call_dict_non_dict_non_list():
|
|
133
|
+
"""Test normalize_to_tool_call_dict with non-dict, non-list value."""
|
|
134
|
+
assert normalize_to_tool_call_dict("string") == "string"
|
|
135
|
+
assert normalize_to_tool_call_dict(123) == 123
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def test_normalize_to_tool_call_dict_empty_list():
|
|
139
|
+
"""Test normalize_to_tool_call_dict with empty list."""
|
|
140
|
+
assert normalize_to_tool_call_dict([]) == []
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def test_normalize_to_tool_call_dict_complex_nested():
|
|
144
|
+
"""Test normalize_to_tool_call_dict with complex nested structure."""
|
|
145
|
+
obj = {
|
|
146
|
+
"outer_tool": {
|
|
147
|
+
"name": "middle_tool",
|
|
148
|
+
"arguments": {"name": "inner_tool", "arguments": {"x": 1}},
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
result = normalize_to_tool_call_dict(obj)
|
|
152
|
+
assert result["name"] == "outer_tool"
|
|
153
|
+
assert result["arguments"]["name"] == "middle_tool"
|
|
154
|
+
assert result["arguments"]["arguments"]["name"] == "inner_tool"
|
hud/utils/tests/test_version.py
CHANGED
hud/utils/types.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING, Any, ParamSpec, TypeVar
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
|
|
8
|
+
P = ParamSpec("P")
|
|
9
|
+
R = TypeVar("R")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def with_signature(
|
|
13
|
+
params_cls: Callable[P, Any],
|
|
14
|
+
) -> Callable[[Callable[..., R]], Callable[P, R]]:
|
|
15
|
+
"""Decorator that gives a method the signature of a Pydantic model."""
|
|
16
|
+
|
|
17
|
+
def decorator(method: Callable[..., R]) -> Callable[P, R]:
|
|
18
|
+
return method # type: ignore[return-value]
|
|
19
|
+
|
|
20
|
+
return decorator
|
hud/version.py
CHANGED
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hud-python
|
|
3
|
+
Version: 0.5.1
|
|
4
|
+
Summary: SDK for the HUD platform.
|
|
5
|
+
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
|
+
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
7
|
+
Project-URL: Documentation, https://docs.hud.ai
|
|
8
|
+
Author-email: HUD <founders@hud.ai>
|
|
9
|
+
License: MIT License
|
|
10
|
+
|
|
11
|
+
Copyright (c) 2025 Human Union Data, Inc
|
|
12
|
+
|
|
13
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
14
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
15
|
+
in the Software without restriction, including without limitation the rights
|
|
16
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
17
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
18
|
+
furnished to do so, subject to the following conditions:
|
|
19
|
+
|
|
20
|
+
The above copyright notice and this permission notice shall be included in all
|
|
21
|
+
copies or substantial portions of the Software.
|
|
22
|
+
|
|
23
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
24
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
25
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
26
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
27
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
28
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
29
|
+
SOFTWARE.
|
|
30
|
+
License-File: LICENSE
|
|
31
|
+
Classifier: Development Status :: 4 - Beta
|
|
32
|
+
Classifier: Intended Audience :: Developers
|
|
33
|
+
Classifier: Programming Language :: Python :: 3
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
37
|
+
Requires-Python: <3.13,>=3.11
|
|
38
|
+
Requires-Dist: blessed>=1.20.0
|
|
39
|
+
Requires-Dist: fastmcp==2.13.3
|
|
40
|
+
Requires-Dist: httpx<1,>=0.23.0
|
|
41
|
+
Requires-Dist: mcp<1.23,>1.21.1
|
|
42
|
+
Requires-Dist: openai>=2.8.1
|
|
43
|
+
Requires-Dist: packaging>=21.0
|
|
44
|
+
Requires-Dist: prompt-toolkit==3.0.51
|
|
45
|
+
Requires-Dist: pydantic-settings<3,>=2.2
|
|
46
|
+
Requires-Dist: pydantic<3,>=2.6
|
|
47
|
+
Requires-Dist: questionary==2.1.0
|
|
48
|
+
Requires-Dist: rich>=13.0.0
|
|
49
|
+
Requires-Dist: scarf-sdk>=0.1.0
|
|
50
|
+
Requires-Dist: toml>=0.10.2
|
|
51
|
+
Requires-Dist: typer>=0.9.0
|
|
52
|
+
Requires-Dist: watchfiles>=0.21.0
|
|
53
|
+
Provides-Extra: agent
|
|
54
|
+
Requires-Dist: anthropic>=0.75; extra == 'agent'
|
|
55
|
+
Requires-Dist: datasets>=2.14.0; extra == 'agent'
|
|
56
|
+
Requires-Dist: google-genai; extra == 'agent'
|
|
57
|
+
Requires-Dist: langchain>=1.1.0; extra == 'agent'
|
|
58
|
+
Requires-Dist: mcp-use==1.5.0; extra == 'agent'
|
|
59
|
+
Requires-Dist: openai-agents; extra == 'agent'
|
|
60
|
+
Requires-Dist: pillow>=11.1.0; extra == 'agent'
|
|
61
|
+
Requires-Dist: tornado>=6.5.2; extra == 'agent'
|
|
62
|
+
Provides-Extra: agents
|
|
63
|
+
Requires-Dist: anthropic>=0.75; extra == 'agents'
|
|
64
|
+
Requires-Dist: datasets>=2.14.0; extra == 'agents'
|
|
65
|
+
Requires-Dist: google-genai; extra == 'agents'
|
|
66
|
+
Requires-Dist: langchain>=1.1.0; extra == 'agents'
|
|
67
|
+
Requires-Dist: mcp-use==1.5.0; extra == 'agents'
|
|
68
|
+
Requires-Dist: openai-agents; extra == 'agents'
|
|
69
|
+
Requires-Dist: pillow>=11.1.0; extra == 'agents'
|
|
70
|
+
Requires-Dist: tornado>=6.5.2; extra == 'agents'
|
|
71
|
+
Provides-Extra: bedrock
|
|
72
|
+
Requires-Dist: anthropic[bedrock]>=0.75; extra == 'bedrock'
|
|
73
|
+
Provides-Extra: dev
|
|
74
|
+
Requires-Dist: anthropic>=0.75; extra == 'dev'
|
|
75
|
+
Requires-Dist: datasets>=2.14.0; extra == 'dev'
|
|
76
|
+
Requires-Dist: dotenv>=0.9.9; extra == 'dev'
|
|
77
|
+
Requires-Dist: google-adk; extra == 'dev'
|
|
78
|
+
Requires-Dist: google-genai; extra == 'dev'
|
|
79
|
+
Requires-Dist: ipykernel; extra == 'dev'
|
|
80
|
+
Requires-Dist: ipython<9; extra == 'dev'
|
|
81
|
+
Requires-Dist: jupyter-client; extra == 'dev'
|
|
82
|
+
Requires-Dist: jupyter-core; extra == 'dev'
|
|
83
|
+
Requires-Dist: langchain>=1.1.0; extra == 'dev'
|
|
84
|
+
Requires-Dist: llama-index-core; extra == 'dev'
|
|
85
|
+
Requires-Dist: mcp-use==1.5.0; extra == 'dev'
|
|
86
|
+
Requires-Dist: openai-agents; extra == 'dev'
|
|
87
|
+
Requires-Dist: pillow>=11.1.0; extra == 'dev'
|
|
88
|
+
Requires-Dist: playwright; extra == 'dev'
|
|
89
|
+
Requires-Dist: pyautogui>=0.9.54; extra == 'dev'
|
|
90
|
+
Requires-Dist: pyright==1.1.407; extra == 'dev'
|
|
91
|
+
Requires-Dist: pytest-asyncio; extra == 'dev'
|
|
92
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
93
|
+
Requires-Dist: pytest-mock; extra == 'dev'
|
|
94
|
+
Requires-Dist: pytest<9,>=8.1.1; extra == 'dev'
|
|
95
|
+
Requires-Dist: ruff>=0.11.8; extra == 'dev'
|
|
96
|
+
Requires-Dist: tornado>=6.5.2; extra == 'dev'
|
|
97
|
+
Description-Content-Type: text/markdown
|
|
98
|
+
|
|
99
|
+
<div align="left">
|
|
100
|
+
<picture>
|
|
101
|
+
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/logo/hud_logo_dark.svg">
|
|
102
|
+
<source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/logo/hud_logo.svg">
|
|
103
|
+
<img src="https://raw.githubusercontent.com/hud-evals/hud-python/main/docs/logo/hud_logo.svg" alt="HUD" width="150" style="margin-bottom: 24px;"/>
|
|
104
|
+
</picture>
|
|
105
|
+
</div>
|
|
106
|
+
|
|
107
|
+
The HUD SDK is an open-source Python toolkit for building, evaluating, and training AI agents. Use a unified API for any model provider, wrap your code as MCP environments, run A/B evals at scale, and train with reinforcement learning.
|
|
108
|
+
|
|
109
|
+
To learn more, check out our [Documentation](https://docs.hud.ai) and [API Reference](https://docs.hud.ai/reference).
|
|
110
|
+
|
|
111
|
+
[PyPI](https://pypi.org/project/hud-python/)
|
|
112
|
+
[License](LICENSE)
|
|
113
|
+
[Add docs MCP server to Cursor](https://cursor.com/en/install-mcp?name=docs-hud-python&config=eyJ1cmwiOiJodHRwczovL2RvY3MuaHVkLmFpL21jcCJ9)
|
|
114
|
+
[Discord](https://discord.gg/wkjtmHYYjm)
|
|
115
|
+
[Follow @hud_evals on X](https://x.com/intent/user?screen_name=hud_evals)
|
|
116
|
+
[Shop](https://shop.hud.ai)
|
|
117
|
+
[Scarf](https://scarf.sh)
|
|
118
|
+
[Docs](https://docs.hud.ai)
|
|
119
|
+
|
|
120
|
+
## Install
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
pip install hud-python
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Get your API key at [hud.ai](https://hud.ai) and set it:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
export HUD_API_KEY=your-key-here
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
> For CLI tools (`hud init`, `hud dev`, etc.): `uv tool install hud-python --python 3.12`
|
|
133
|
+
|
|
134
|
+

|
|
135
|
+
|
|
136
|
+
## Usage
|
|
137
|
+
|
|
138
|
+
### Unified Model API
|
|
139
|
+
|
|
140
|
+
Use Claude, GPT, Gemini, or Grok through one OpenAI-compatible endpoint:
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
from openai import AsyncOpenAI
|
|
144
|
+
import os
|
|
145
|
+
|
|
146
|
+
client = AsyncOpenAI(
|
|
147
|
+
base_url="https://inference.hud.ai",
|
|
148
|
+
api_key=os.environ["HUD_API_KEY"]
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
response = await client.chat.completions.create(
|
|
152
|
+
model="claude-sonnet-4-5", # or gpt-4o, gemini-2.5-pro (https://hud.ai/models)
|
|
153
|
+
messages=[{"role": "user", "content": "Hello!"}]
|
|
154
|
+
)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
Every call is traced at [hud.ai](https://hud.ai). → [Docs](https://docs.hud.ai/quick-links/gateway)
|
|
158
|
+
|
|
159
|
+
### Environments
|
|
160
|
+
|
|
161
|
+
Turn your code into tools agents can call. Define how to evaluate them:
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
from hud import Environment
|
|
165
|
+
|
|
166
|
+
env = Environment("my-env")
|
|
167
|
+
|
|
168
|
+
@env.tool()
|
|
169
|
+
def add(a: int, b: int) -> int:
|
|
170
|
+
"""Add two numbers."""
|
|
171
|
+
return a + b
|
|
172
|
+
|
|
173
|
+
@env.scenario("solve-math")
|
|
174
|
+
async def solve_math(problem: str, answer: int):
|
|
175
|
+
response = yield problem # Prompt
|
|
176
|
+
yield 1.0 if str(answer) in response else 0.0 # Reward
|
|
177
|
+
|
|
178
|
+
async with env("solve-math", problem="What is 2+2?", answer=4) as ctx:
|
|
179
|
+
# Your agent logic here - call tools, get response
|
|
180
|
+
result = await ctx.call_tool("add", a=2, b=2)
|
|
181
|
+
await ctx.submit(f"The answer is {result}")
|
|
182
|
+
|
|
183
|
+
print(ctx.reward) # 1.0
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
The agent runs between the yields. First yield sends the prompt, second yield scores the result. → [Docs](https://docs.hud.ai/quick-links/environments) · [Templates](https://hud.ai/environments)
|
|
187
|
+
|
|
188
|
+
### A/B Evals
|
|
189
|
+
|
|
190
|
+
Test different models. Repeat runs to see the distribution:
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
from openai import AsyncOpenAI
|
|
194
|
+
import os
|
|
195
|
+
|
|
196
|
+
client = AsyncOpenAI(
|
|
197
|
+
base_url="https://inference.hud.ai",
|
|
198
|
+
api_key=os.environ["HUD_API_KEY"]
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
# Using the env from above
|
|
202
|
+
async with env("solve-math", problem="What is 2+2?", answer=4, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx:
|
|
203
|
+
response = await client.chat.completions.create(
|
|
204
|
+
model=ctx.variants["model"],
|
|
205
|
+
messages=[{"role": "user", "content": ctx.prompt}],
|
|
206
|
+
tools=ctx.tools # Environment tools available to the model
|
|
207
|
+
)
|
|
208
|
+
await ctx.submit(response.choices[0].message.content)
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
**Variants** test configurations. **Groups** repeat for distribution. Results stream to [hud.ai](https://hud.ai). → [Docs](https://docs.hud.ai/quick-links/ab-testing)
|
|
212
|
+
|
|
213
|
+
### Deploy & Train
|
|
214
|
+
|
|
215
|
+
Push to GitHub, connect on hud.ai, run at scale:
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
hud init # Scaffold environment
|
|
219
|
+
git push # Push to GitHub
|
|
220
|
+
# Connect on hud.ai → New → Environment
|
|
221
|
+
hud eval my-eval --model gpt-4o --group-size 100
|
|
222
|
+
# Or create and run tasks on the platform
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
Every run generates training data. Use it to fine-tune or run RL. → [Docs](https://docs.hud.ai/quick-links/deploy)
|
|
226
|
+
|
|
227
|
+
## Links
|
|
228
|
+
|
|
229
|
+
- 📖 [Documentation](https://docs.hud.ai)
|
|
230
|
+
- ⌨️ [CLI Reference](https://docs.hud.ai/reference/cli/overview)
|
|
231
|
+
- 🏆 [Leaderboards](https://hud.ai/leaderboards)
|
|
232
|
+
- 🌐 [Environment Templates](https://hud.ai/environments)
|
|
233
|
+
- 🤖 [Supported Models](https://hud.ai/models)
|
|
234
|
+
- 💬 [Discord](https://discord.gg/wkjtmHYYjm)
|
|
235
|
+
|
|
236
|
+
## Enterprise
|
|
237
|
+
|
|
238
|
+
Building agents at scale? We work with teams on custom environments, benchmarks, and training.
|
|
239
|
+
|
|
240
|
+
[📅 Book a call](https://cal.com/jay-hud) · [📧 founders@hud.ai](mailto:founders@hud.ai)
|
|
241
|
+
|
|
242
|
+
## Contributing
|
|
243
|
+
|
|
244
|
+
We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md).
|
|
245
|
+
|
|
246
|
+
Key areas: [Agents](hud/agents/) · [Tools](hud/tools/) · [Environments](https://hud.ai/environments)
|
|
247
|
+
|
|
248
|
+
<a href="https://github.com/hud-evals/hud-python/graphs/contributors">
|
|
249
|
+
<img src="https://contrib.rocks/image?repo=hud-evals/hud-python&max=50" />
|
|
250
|
+
</a>
|
|
251
|
+
|
|
252
|
+
## Citation
|
|
253
|
+
|
|
254
|
+
```bibtex
|
|
255
|
+
@software{hud2025agentevalplatform,
|
|
256
|
+
author = {HUD and Jay Ram and Lorenss Martinsons and Parth Patel and Govind Pimpale and Dylan Bowman and Jaideep and Nguyen Nhat Minh},
|
|
257
|
+
title = {HUD: An Evaluation and RL Environments Platform for Agents},
|
|
258
|
+
date = {2025-04},
|
|
259
|
+
url = {https://github.com/hud-evals/hud-python},
|
|
260
|
+
langid = {en}
|
|
261
|
+
}
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
MIT License · [LICENSE](LICENSE)
|