hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from hud.telemetry.instrument import _serialize_value, instrument
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def test_serialize_value_simple_types():
    """Primitive values come back from _serialize_value unchanged."""
    # Value-compared primitives.
    for primitive in ("string", 42, 3.14):
        assert _serialize_value(primitive) == primitive

    # Singletons are checked by identity rather than equality.
    assert _serialize_value(True) is True
    assert _serialize_value(None) is None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_serialize_value_list():
    """A short list is serialized to an equal list."""
    assert _serialize_value([1, 2, 3]) == [1, 2, 3]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_serialize_value_list_truncation():
    """A list longer than max_items keeps only its first max_items entries."""
    truncated = _serialize_value(list(range(20)), max_items=5)

    assert len(truncated) == 5
    assert truncated == [0, 1, 2, 3, 4]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def test_serialize_value_tuple():
    """A tuple is serialized to a list with the same elements."""
    # The serialized form is JSON-like, so the tuple comes back as a list.
    assert _serialize_value((1, 2, 3)) == [1, 2, 3]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def test_serialize_value_tuple_truncation():
    """A tuple longer than max_items is cut down to max_items entries."""
    truncated = _serialize_value(tuple(range(20)), max_items=5)

    assert len(truncated) == 5
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_serialize_value_dict():
    """A small dict is serialized to an equal dict."""
    assert _serialize_value({"key": "value"}) == {"key": "value"}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def test_serialize_value_dict_truncation():
    """A dict with more than max_items entries is cut down to max_items."""
    oversized = {f"key{i}": i for i in range(20)}

    truncated = _serialize_value(oversized, max_items=5)

    assert len(truncated) == 5
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_serialize_value_complex_object():
    """A dataclass instance is serialized to a dict of its fields."""

    @dataclass
    class CustomObj:
        name: str
        value: int

    serialized = _serialize_value(CustomObj(name="test", value=42))

    assert isinstance(serialized, dict)
    assert serialized["name"] == "test"
    assert serialized["value"] == 42
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_serialize_value_fallback():
    """An object with no structured form falls back to a string mentioning its class."""

    class WeirdObj:
        def __init__(self):
            raise Exception("Can't access")

    # Allocate the instance without running the raising __init__.
    instance = WeirdObj.__new__(WeirdObj)
    serialized = _serialize_value(instance)

    # The fallback is a textual representation of the object.
    assert isinstance(serialized, str)
    assert "WeirdObj" in serialized
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@pytest.mark.asyncio
async def test_instrument_async_basic():
    """A bare @instrument on a coroutine leaves its return value intact."""

    @instrument
    async def test_func(x: int, y: int) -> int:
        return x + y

    assert await test_func(2, 3) == 5
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@pytest.mark.asyncio
async def test_instrument_async_with_params():
    """@instrument accepts a custom name and category on a coroutine."""

    @instrument(name="custom_name", category="custom_type")
    async def test_func(x: int) -> int:
        return x * 2

    assert await test_func(5) == 10
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
@pytest.mark.asyncio
async def test_instrument_async_with_exception():
    """An exception raised inside an instrumented coroutine reaches the caller."""

    @instrument
    async def test_func():
        raise ValueError("Test error")

    with pytest.raises(ValueError, match="Test error"):
        await test_func()
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@pytest.mark.asyncio
async def test_instrument_async_no_record_args():
    """record_args=False does not affect the coroutine's return value."""

    @instrument(record_args=False)
    async def test_func(x: int) -> int:
        return x

    assert await test_func(42) == 42
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@pytest.mark.asyncio
async def test_instrument_async_no_record_result():
    """record_result=False does not affect the coroutine's return value."""

    @instrument(record_result=False)
    async def test_func() -> str:
        return "test"

    assert await test_func() == "test"
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
@pytest.mark.asyncio
async def test_instrument_async_with_category():
    """A custom category can be supplied when instrumenting a coroutine."""

    @instrument(category="agent")
    async def test_func() -> int:
        return 42

    assert await test_func() == 42
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def test_instrument_sync_basic():
    """A bare @instrument on a plain function leaves its return value intact."""

    @instrument
    def test_func(x: int, y: int) -> int:
        return x + y

    assert test_func(2, 3) == 5
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def test_instrument_sync_with_params():
    """@instrument accepts a custom name and category on a plain function."""

    @instrument(name="sync_custom", category="sync_type")
    def test_func(x: int) -> int:
        return x * 2

    assert test_func(5) == 10
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def test_instrument_sync_with_exception():
    """An exception raised inside an instrumented function reaches the caller."""

    @instrument
    def test_func():
        raise ValueError("Sync error")

    with pytest.raises(ValueError, match="Sync error"):
        test_func()
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def test_instrument_sync_no_record_args():
    """record_args=False does not affect a plain function's return value."""

    @instrument(record_args=False)
    def test_func(x: int) -> int:
        return x

    assert test_func(42) == 42
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def test_instrument_sync_no_record_result():
    """record_result=False does not affect a plain function's return value."""

    @instrument(record_result=False)
    def test_func() -> str:
        return "test"

    assert test_func() == "test"
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def test_instrument_sync_with_category():
    """A custom category can be supplied when instrumenting a plain function."""

    @instrument(category="tool")
    def test_func() -> int:
        return 42

    assert test_func() == 42
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def test_instrument_already_instrumented():
    """Instrumenting an already-instrumented function returns that same object."""

    @instrument
    def test_func():
        return "original"

    # A second pass must hand back the identical wrapper, not a new one.
    rewrapped = instrument(test_func)

    assert rewrapped is test_func
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def test_instrument_marks_as_instrumented():
    """The wrapper carries the _hud_instrumented flag and a _hud_original attribute."""

    @instrument
    def test_func():
        return True

    assert hasattr(test_func, "_hud_instrumented")
    assert test_func._hud_instrumented is True
    # Only the presence of _hud_original is checked, not its value.
    assert hasattr(test_func, "_hud_original")
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
@pytest.mark.asyncio
async def test_instrument_async_complex_result():
    """A nested dict returned by an instrumented coroutine is passed back to the caller."""

    @instrument
    async def test_func() -> dict:
        return {"nested": {"data": [1, 2, 3]}, "count": 3}

    payload = await test_func()

    assert payload["count"] == 3
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def test_instrument_sync_complex_result():
    """A dataclass returned by an instrumented function is passed back to the caller."""

    @dataclass
    class Result:
        value: int
        name: str

    @instrument
    def test_func() -> Result:
        return Result(value=42, name="test")

    returned = test_func()

    assert returned.value == 42
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
@pytest.mark.asyncio
async def test_instrument_async_with_self_param():
    """@instrument works on an async instance method that takes `self`."""

    class TestClass:
        @instrument
        async def method(self, x: int) -> int:
            return x * 2

    instance = TestClass()

    assert await instance.method(5) == 10
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def test_instrument_sync_with_cls_param():
    """@instrument works underneath @classmethod on a method that takes `cls`."""

    class TestClass:
        @classmethod
        @instrument
        def method(cls, x: int) -> int:
            return x * 3

    assert TestClass.method(4) == 12
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
@pytest.mark.asyncio
async def test_instrument_async_serialization_error():
    """An argument that raises on any attribute access does not break the call."""

    class UnserializableArg:
        def __getattribute__(self, name):
            raise Exception("Can't serialize")

    @instrument
    async def test_func(arg):
        return "success"

    # The call must complete even though the argument cannot be inspected.
    outcome = await test_func(UnserializableArg())

    assert outcome == "success"
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def test_instrument_function_without_signature():
    """@instrument can wrap the built-in `len` and still return its result."""
    # The original test treats built-ins as callables without an inspectable signature.
    wrapped_len = instrument(len)

    assert wrapped_len([1, 2, 3]) == 3
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
@pytest.mark.asyncio
async def test_instrument_async_result_serialization_error():
    """A return value that raises when iterated is still handed back to the caller."""

    class UnserializableResult:
        def __iter__(self):
            raise Exception("Can't iterate")

    @instrument
    async def test_func():
        return UnserializableResult()

    # The call must complete even if the result cannot be recorded.
    returned = await test_func()

    assert isinstance(returned, UnserializableResult)
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def test_instrument_without_parentheses():
    """The decorator can be applied bare, as `@instrument`."""

    @instrument
    def test_func(x: int) -> int:
        return x + 1

    incremented = test_func(5)
    assert incremented == 6
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def test_instrument_with_parentheses():
    """The decorator can be applied as a call with no arguments, `@instrument()`."""

    @instrument()
    def test_func(x: int) -> int:
        return x + 1

    incremented = test_func(5)
    assert incremented == 6
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
@pytest.mark.asyncio
async def test_instrument_async_with_defaults():
    """Default argument values still apply on an instrumented coroutine."""

    @instrument
    async def test_func(x: int, y: int = 10) -> int:
        return x + y

    with_default = await test_func(5)
    assert with_default == 15

    with_override = await test_func(5, 20)
    assert with_override == 25
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def test_instrument_sync_with_kwargs():
    """Extra keyword arguments are forwarded to an instrumented **kwargs function."""

    @instrument
    def test_func(x: int, **kwargs) -> dict:
        return {"x": x, **kwargs}

    assert test_func(1, a=2, b=3) == {"x": 1, "a": 2, "b": 3}
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
@pytest.mark.asyncio
async def test_instrument_async_with_varargs():
    """Positional varargs are forwarded to an instrumented *args coroutine."""

    @instrument
    async def test_func(*args) -> int:
        return sum(args)

    assert await test_func(1, 2, 3, 4) == 10
|
hud/tools/__init__.py
CHANGED
|
@@ -4,6 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import TYPE_CHECKING, Any
|
|
6
6
|
|
|
7
|
+
from .agent import AgentTool
|
|
7
8
|
from .base import BaseHub, BaseTool
|
|
8
9
|
from .bash import BashTool
|
|
9
10
|
from .edit import EditTool
|
|
@@ -12,17 +13,26 @@ from .response import ResponseTool
|
|
|
12
13
|
from .submit import SubmitTool
|
|
13
14
|
|
|
14
15
|
if TYPE_CHECKING:
|
|
15
|
-
from .computer import
|
|
16
|
+
from .computer import (
|
|
17
|
+
AnthropicComputerTool,
|
|
18
|
+
GeminiComputerTool,
|
|
19
|
+
HudComputerTool,
|
|
20
|
+
OpenAIComputerTool,
|
|
21
|
+
QwenComputerTool,
|
|
22
|
+
)
|
|
16
23
|
|
|
17
24
|
__all__ = [
|
|
25
|
+
"AgentTool",
|
|
18
26
|
"AnthropicComputerTool",
|
|
19
27
|
"BaseHub",
|
|
20
28
|
"BaseTool",
|
|
21
29
|
"BashTool",
|
|
22
30
|
"EditTool",
|
|
31
|
+
"GeminiComputerTool",
|
|
23
32
|
"HudComputerTool",
|
|
24
33
|
"OpenAIComputerTool",
|
|
25
34
|
"PlaywrightTool",
|
|
35
|
+
"QwenComputerTool",
|
|
26
36
|
"ResponseTool",
|
|
27
37
|
"SubmitTool",
|
|
28
38
|
]
|
|
@@ -30,7 +40,13 @@ __all__ = [
|
|
|
30
40
|
|
|
31
41
|
def __getattr__(name: str) -> Any:
|
|
32
42
|
"""Lazy import computer tools to avoid importing pyautogui unless needed."""
|
|
33
|
-
if name in (
|
|
43
|
+
if name in (
|
|
44
|
+
"AnthropicComputerTool",
|
|
45
|
+
"HudComputerTool",
|
|
46
|
+
"OpenAIComputerTool",
|
|
47
|
+
"GeminiComputerTool",
|
|
48
|
+
"QwenComputerTool",
|
|
49
|
+
):
|
|
34
50
|
from . import computer
|
|
35
51
|
|
|
36
52
|
return getattr(computer, name)
|
hud/tools/agent.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""AgentTool - run a Task with an agent as a tool."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import inspect
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Union, get_args, get_origin
|
|
7
|
+
|
|
8
|
+
from fastmcp.tools.tool import FunctionTool, ToolResult
|
|
9
|
+
from mcp.types import TextContent
|
|
10
|
+
|
|
11
|
+
from hud.tools.base import BaseTool
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from hud.agents.base import MCPAgent
|
|
15
|
+
from hud.eval.task import Task
|
|
16
|
+
|
|
17
|
+
__all__ = ["AgentTool"]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _string_annotation_is_optional(annotation: str) -> bool:
    """Return True if a string annotation is a top-level union that includes None.

    The annotation is parsed, not evaluated, so names that cannot be resolved
    here (forward references, types from other modules) are handled fine.
    Only the outermost type is inspected: ``"list[int | None]"`` is a list,
    not an optional value, and therefore yields False.
    """
    import ast

    try:
        root = ast.parse(annotation.strip(), mode="eval").body
    except (SyntaxError, ValueError):
        # Not parseable as an expression: fall back to the textual heuristic
        # for "X | None", "Union[X, None]", or "Optional[X]".
        return (
            "| None" in annotation
            or "None |" in annotation
            or "Optional[" in annotation
            or ("Union[" in annotation and "None" in annotation)
        )

    def head_name(node: ast.Subscript) -> str | None:
        # "Optional[...]" and "typing.Optional[...]" both resolve to "Optional".
        target = node.value
        if isinstance(target, ast.Attribute):
            return target.attr
        if isinstance(target, ast.Name):
            return target.id
        return None

    def subscript_args(node: ast.Subscript) -> list[Any]:
        inner: Any = node.slice
        if type(inner).__name__ == "Index":  # Python 3.8 wraps the slice
            inner = inner.value
        return list(inner.elts) if isinstance(inner, ast.Tuple) else [inner]

    def is_none(node: Any) -> bool:
        return isinstance(node, ast.Constant) and node.value is None

    def is_optional(node: Any) -> bool:
        # PEP 604 syntax: A | B | None
        if isinstance(node, ast.BinOp) and isinstance(node.op, ast.BitOr):
            return any(is_none(side) or is_optional(side) for side in (node.left, node.right))
        if isinstance(node, ast.Subscript):
            head = head_name(node)
            if head == "Optional":
                return True
            if head == "Union":
                return any(is_none(arg) or is_optional(arg) for arg in subscript_args(node))
            if head == "Annotated":
                # Annotated[T, ...] is optional exactly when T is.
                args = subscript_args(node)
                return bool(args) and is_optional(args[0])
        return False

    return is_optional(root)


def _is_eval_only(param: inspect.Parameter) -> bool:
    """Check if param is eval-only: has None default AND None in type union.

    Handles both runtime types and string annotations (PEP 563).

    Args:
        param: A parameter taken from a scenario function's signature.

    Returns:
        True only when the default is ``None`` and the annotation is a union
        that includes ``None`` at its top level (``X | None``, ``Optional[X]``,
        ``Union[X, None]``). Unannotated parameters return False.
    """
    # Must have default of None (a missing default is Parameter.empty, not None)
    if param.default is not None:
        return False

    annotation = param.annotation
    if annotation is inspect.Parameter.empty:
        return False

    # String annotations (from __future__ annotations or quoted) are inspected
    # structurally so a None nested inside a container is not mistaken for an
    # optional parameter.
    if isinstance(annotation, str):
        return _string_annotation_is_optional(annotation)

    # Runtime typing.Union / typing.Optional
    if get_origin(annotation) is Union:
        return type(None) in get_args(annotation)

    # Python 3.10+ "X | None" evaluated at runtime yields types.UnionType;
    # the attribute does not exist on older interpreters.
    import types

    union_type = getattr(types, "UnionType", None)
    if union_type is not None and isinstance(annotation, union_type):
        return type(None) in get_args(annotation)

    return False
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class AgentTool(BaseTool):
    """Tool that runs a Task template with an agent.

    Parameters with `| None = None` are eval-only and hidden from the tool schema.

    Example:
        ```python
        @env.scenario()
        async def investigate(
            issue_id: str,  # Required - orchestrator sees
            expected_cause: str | None = None,  # Eval only - hidden
        ):
            yield {"task": f"Investigate {issue_id}"}


        seer = AgentTool(env("investigate"), model="ft:seer-v2")
        ```
    """

    def __init__(
        self,
        task: Task,
        *,
        model: str | None = None,
        agent: type[MCPAgent] | None = None,
        agent_params: dict[str, Any] | None = None,
        name: str | None = None,
        description: str | None = None,
        trace: bool = False,
    ) -> None:
        """Configure the tool around a task template.

        Args:
            task: Task template; its ``args`` are merged with call-time arguments.
            model: Model identifier passed to ``create_agent`` at call time.
            agent: Agent class whose ``create`` classmethod builds the agent.
            agent_params: Extra keyword arguments for agent construction.
            name: Tool name; defaults to the task's scenario, then "agent_tool".
            description: Tool description; defaults to "Run scenario: <scenario>".
            trace: Request tracing for top-level (non-nested) executions.

        Raises:
            ValueError: If neither or both of ``model`` and ``agent`` are given.
        """
        if not model and agent is None:
            raise ValueError("Must provide either 'model' or 'agent'")
        if model and agent is not None:
            raise ValueError("Cannot provide both 'model' and 'agent'")

        self._task = task
        self._model = model
        self._agent_cls = agent
        self._agent_params = agent_params or {}
        self._trace = trace

        # Get visible params from scenario function; the empty schema below is
        # kept when the task has no env/scenario or the scenario is unknown.
        self._visible_params: set[str] = set()
        self._param_schema: dict[str, Any] = {
            "type": "object",
            "properties": {},
            "required": [],
        }

        if task.env and task.scenario:
            scenario_fn = task.env._scenarios.get(task.scenario)
            if scenario_fn:
                sig = inspect.signature(scenario_fn)
                visible = {
                    param_name: param
                    for param_name, param in sig.parameters.items()
                    if not _is_eval_only(param)
                }
                self._visible_params = set(visible)
                # String annotations must be resolved where the scenario was
                # defined, otherwise types imported only there are lost.
                self._param_schema = self._build_schema(
                    visible, globalns=self._scenario_globals(scenario_fn)
                )

        tool_name = name or task.scenario or "agent_tool"
        tool_desc = description or f"Run scenario: {task.scenario}"

        super().__init__(name=tool_name, description=tool_desc)

    @staticmethod
    def _scenario_globals(fn: Any) -> dict[str, Any]:
        """Return the module globals of ``fn`` (after unwrapping), or an empty dict."""
        try:
            target = inspect.unwrap(fn)
        except ValueError:
            # Cyclic __wrapped__ chain: use the callable as given.
            target = fn
        found = getattr(target, "__globals__", None)
        return found if isinstance(found, dict) else {}

    def _build_schema(
        self,
        params: dict[str, inspect.Parameter],
        *,
        globalns: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Build JSON schema using Pydantic TypeAdapter.

        Args:
            params: Visible scenario parameters, keyed by name.
            globalns: Extra names used to resolve string annotations, typically
                the globals of the module that defines the scenario. These take
                precedence over this module's globals.

        Returns:
            An object schema with ``properties`` and ``required``. A parameter
            whose annotation is missing, unresolvable, or rejected by Pydantic
            is described as ``{"type": "string"}``.
        """
        from pydantic import TypeAdapter

        # Fresh dict so evaluation never writes into a real module namespace.
        namespace: dict[str, Any] = {**globals(), **(globalns or {})}

        properties: dict[str, Any] = {}
        required: list[str] = []

        for name, param in params.items():
            schema: dict[str, Any] = {"type": "string"}
            annotation = param.annotation

            if annotation is not inspect.Parameter.empty:
                if isinstance(annotation, str):
                    # NOTE(security): eval() runs arbitrary code. This is acceptable
                    # only because the text comes from the scenario author's own
                    # source annotations; never feed it externally supplied strings.
                    try:
                        annotation = eval(annotation, namespace)  # noqa: S307
                    except Exception:
                        # Unresolvable name: keep the string fallback, but still
                        # apply the required/default handling below.
                        annotation = None

                if annotation is not None:
                    try:
                        schema = TypeAdapter(annotation).json_schema()
                    except Exception:
                        # Best effort: a type Pydantic cannot describe must not
                        # prevent the tool from being constructed.
                        schema = {"type": "string"}

            properties[name] = schema

            if param.default is inspect.Parameter.empty:
                required.append(name)
            elif param.default is not None:
                properties[name]["default"] = param.default

        return {"type": "object", "properties": properties, "required": required}

    @property
    def mcp(self) -> FunctionTool:
        """Get as FastMCP FunctionTool with filtered schema."""
        if not hasattr(self, "_mcp_tool"):
            # Directly instantiate FunctionTool with our callable and schema
            # This bypasses from_function's signature parsing
            self._mcp_tool = FunctionTool(
                name=self.name,
                description=self.description or "",
                parameters=self._param_schema,
                fn=self._execute_with_args,
            )
        return self._mcp_tool

    async def _execute_with_args(self, **kwargs: Any) -> ToolResult:
        """Internal executor that FastMCP calls with parsed arguments."""
        return await self(**kwargs)

    async def __call__(self, **kwargs: Any) -> ToolResult:
        """Execute the task with a fresh agent.

        Args:
            **kwargs: Call-time arguments; keys that are not visible scenario
                parameters are dropped before merging with the template args.

        Returns:
            A ToolResult with a single text item holding the agent result's
            ``content`` (empty string when absent or falsy).
        """
        from hud.eval.context import get_current_trace_id
        from hud.eval.manager import run_eval
        from hud.telemetry.instrument import instrument

        # Filter to visible params only
        filtered = {key: value for key, value in kwargs.items() if key in self._visible_params}

        # Merge with template args (call-time values win)
        base_args = self._task.args or {}
        task = self._task.model_copy(update={"args": {**base_args, **filtered}})

        # Use parent trace if available (for hierarchical agents)
        parent_trace_id = get_current_trace_id()

        # If nested (has parent), skip subagent's enter/exit registration
        # Tool calls are still recorded via the shared trace_id's context
        is_nested = parent_trace_id is not None

        # Trace if explicitly requested AND not nested (nested uses parent trace)
        should_trace = self._trace and not is_nested

        # Wrap execution with instrumentation to mark as subagent
        # Platform uses category="subagent" to detect and render subagent tool calls
        @instrument(category="subagent", name=self.name)
        async def _run_subagent() -> ToolResult:
            async with run_eval(
                task,
                trace=should_trace,
                trace_id=parent_trace_id,
                quiet=True,
            ) as ctx:
                if self._model:
                    from hud.agents import create_agent

                    agent = create_agent(self._model, **self._agent_params)
                else:
                    agent = self._agent_cls.create(**self._agent_params)  # type: ignore

                result = await agent.run(ctx)
                content = result.content if hasattr(result, "content") and result.content else ""
                return ToolResult(content=[TextContent(type="text", text=content)])

        return await _run_subagent()
|