hud-python 0.5.1__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +1 -1
- hud/agents/__init__.py +65 -6
- hud/agents/base.py +33 -15
- hud/agents/claude.py +60 -31
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +15 -26
- hud/agents/gemini_cua.py +6 -17
- hud/agents/misc/response_agent.py +7 -0
- hud/agents/openai.py +16 -29
- hud/agents/openai_chat.py +3 -19
- hud/agents/operator.py +5 -17
- hud/agents/resolver.py +70 -0
- hud/agents/tests/test_claude.py +2 -4
- hud/agents/tests/test_openai.py +2 -1
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +34 -3
- hud/cli/build.py +37 -5
- hud/cli/dev.py +11 -2
- hud/cli/eval.py +51 -39
- hud/cli/flows/init.py +1 -1
- hud/cli/pull.py +1 -1
- hud/cli/push.py +9 -2
- hud/cli/tests/test_build.py +2 -2
- hud/cli/tests/test_push.py +1 -1
- hud/cli/utils/metadata.py +1 -1
- hud/cli/utils/tests/test_metadata.py +1 -1
- hud/clients/mcp_use.py +6 -1
- hud/datasets/loader.py +17 -18
- hud/datasets/runner.py +16 -10
- hud/datasets/tests/test_loader.py +15 -15
- hud/environment/__init__.py +5 -3
- hud/environment/connection.py +58 -6
- hud/environment/connectors/mcp_config.py +29 -1
- hud/environment/environment.py +218 -77
- hud/environment/router.py +175 -24
- hud/environment/scenarios.py +313 -186
- hud/environment/tests/test_connectors.py +10 -23
- hud/environment/tests/test_environment.py +432 -0
- hud/environment/tests/test_local_connectors.py +81 -40
- hud/environment/tests/test_scenarios.py +820 -14
- hud/eval/context.py +63 -10
- hud/eval/instrument.py +4 -2
- hud/eval/manager.py +79 -12
- hud/eval/task.py +36 -4
- hud/eval/tests/test_eval.py +1 -1
- hud/eval/tests/test_task.py +147 -1
- hud/eval/types.py +2 -0
- hud/eval/utils.py +14 -3
- hud/patches/mcp_patches.py +178 -21
- hud/telemetry/instrument.py +8 -1
- hud/telemetry/tests/test_eval_telemetry.py +8 -8
- hud/tools/__init__.py +2 -0
- hud/tools/agent.py +223 -0
- hud/tools/computer/__init__.py +34 -5
- hud/tools/shell.py +3 -3
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/types.py +62 -34
- hud/utils/hud_console.py +30 -17
- hud/utils/strict_schema.py +1 -1
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/METADATA +2 -2
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/RECORD +67 -61
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/WHEEL +0 -0
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.5.1.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
"""Tests for AgentTool - scenario-to-agent composition."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import inspect
|
|
6
|
+
from unittest.mock import AsyncMock, MagicMock, patch
|
|
7
|
+
|
|
8
|
+
import pytest
|
|
9
|
+
|
|
10
|
+
from hud.environment import Environment
|
|
11
|
+
from hud.eval.task import Task
|
|
12
|
+
from hud.tools.agent import AgentTool, _is_eval_only
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class TestIsEvalOnly:
|
|
16
|
+
"""Tests for _is_eval_only helper function."""
|
|
17
|
+
|
|
18
|
+
def test_required_param_not_eval_only(self) -> None:
|
|
19
|
+
"""Required params (no default) are not eval-only."""
|
|
20
|
+
|
|
21
|
+
def fn(x: str) -> None:
|
|
22
|
+
pass
|
|
23
|
+
|
|
24
|
+
sig = inspect.signature(fn)
|
|
25
|
+
param = sig.parameters["x"]
|
|
26
|
+
assert not _is_eval_only(param)
|
|
27
|
+
|
|
28
|
+
def test_optional_with_value_not_eval_only(self) -> None:
|
|
29
|
+
"""Optional params with non-None default are not eval-only."""
|
|
30
|
+
|
|
31
|
+
def fn(x: str = "default") -> None:
|
|
32
|
+
pass
|
|
33
|
+
|
|
34
|
+
sig = inspect.signature(fn)
|
|
35
|
+
param = sig.parameters["x"]
|
|
36
|
+
assert not _is_eval_only(param)
|
|
37
|
+
|
|
38
|
+
def test_optional_none_without_union_not_eval_only(self) -> None:
|
|
39
|
+
"""Optional with None default but no None in type is not eval-only."""
|
|
40
|
+
|
|
41
|
+
def fn(x: str = None) -> None: # type: ignore[assignment] # noqa: RUF013
|
|
42
|
+
pass
|
|
43
|
+
|
|
44
|
+
sig = inspect.signature(fn)
|
|
45
|
+
param = sig.parameters["x"]
|
|
46
|
+
assert not _is_eval_only(param)
|
|
47
|
+
|
|
48
|
+
def test_optional_none_with_union_is_eval_only(self) -> None:
|
|
49
|
+
"""Params with `X | None = None` pattern are eval-only."""
|
|
50
|
+
|
|
51
|
+
def fn(x: str | None = None) -> None:
|
|
52
|
+
pass
|
|
53
|
+
|
|
54
|
+
sig = inspect.signature(fn)
|
|
55
|
+
param = sig.parameters["x"]
|
|
56
|
+
assert _is_eval_only(param)
|
|
57
|
+
|
|
58
|
+
def test_optional_int_none_is_eval_only(self) -> None:
|
|
59
|
+
"""Works with int | None = None too."""
|
|
60
|
+
|
|
61
|
+
def fn(x: int | None = None) -> None:
|
|
62
|
+
pass
|
|
63
|
+
|
|
64
|
+
sig = inspect.signature(fn)
|
|
65
|
+
param = sig.parameters["x"]
|
|
66
|
+
assert _is_eval_only(param)
|
|
67
|
+
|
|
68
|
+
def test_string_annotation_with_none_union(self) -> None:
|
|
69
|
+
"""Handles string annotations like 'str | None'."""
|
|
70
|
+
# Simulate string annotation
|
|
71
|
+
param = inspect.Parameter(
|
|
72
|
+
"x",
|
|
73
|
+
inspect.Parameter.POSITIONAL_OR_KEYWORD,
|
|
74
|
+
default=None,
|
|
75
|
+
annotation="str | None",
|
|
76
|
+
)
|
|
77
|
+
assert _is_eval_only(param)
|
|
78
|
+
|
|
79
|
+
def test_string_annotation_without_none(self) -> None:
|
|
80
|
+
"""String annotations without None are not eval-only."""
|
|
81
|
+
param = inspect.Parameter(
|
|
82
|
+
"x",
|
|
83
|
+
inspect.Parameter.POSITIONAL_OR_KEYWORD,
|
|
84
|
+
default=None,
|
|
85
|
+
annotation="str",
|
|
86
|
+
)
|
|
87
|
+
assert not _is_eval_only(param)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class TestAgentToolInit:
|
|
91
|
+
"""Tests for AgentTool initialization."""
|
|
92
|
+
|
|
93
|
+
def test_requires_model_or_agent(self) -> None:
|
|
94
|
+
"""Must provide either model or agent."""
|
|
95
|
+
task = Task(args={})
|
|
96
|
+
|
|
97
|
+
with pytest.raises(ValueError, match="Must provide either"):
|
|
98
|
+
AgentTool(task)
|
|
99
|
+
|
|
100
|
+
def test_cannot_provide_both_model_and_agent(self) -> None:
|
|
101
|
+
"""Cannot provide both model and agent."""
|
|
102
|
+
task = Task(args={})
|
|
103
|
+
mock_agent = MagicMock()
|
|
104
|
+
|
|
105
|
+
with pytest.raises(ValueError, match="Cannot provide both"):
|
|
106
|
+
AgentTool(task, model="claude", agent=mock_agent) # type: ignore[arg-type]
|
|
107
|
+
|
|
108
|
+
def test_accepts_model_string(self) -> None:
|
|
109
|
+
"""Can create with model string."""
|
|
110
|
+
task = Task(scenario="test", args={})
|
|
111
|
+
tool = AgentTool(task, model="claude")
|
|
112
|
+
|
|
113
|
+
assert tool._model == "claude"
|
|
114
|
+
assert tool._agent_cls is None
|
|
115
|
+
|
|
116
|
+
def test_accepts_agent_class(self) -> None:
|
|
117
|
+
"""Can create with custom agent class."""
|
|
118
|
+
task = Task(scenario="test", args={})
|
|
119
|
+
mock_agent_cls = MagicMock()
|
|
120
|
+
tool = AgentTool(task, agent=mock_agent_cls) # type: ignore[arg-type]
|
|
121
|
+
|
|
122
|
+
assert tool._model is None
|
|
123
|
+
assert tool._agent_cls is mock_agent_cls
|
|
124
|
+
|
|
125
|
+
def test_name_defaults_to_scenario(self) -> None:
|
|
126
|
+
"""Tool name defaults to scenario name."""
|
|
127
|
+
task = Task(scenario="investigate", args={})
|
|
128
|
+
tool = AgentTool(task, model="claude")
|
|
129
|
+
|
|
130
|
+
assert tool.name == "investigate"
|
|
131
|
+
|
|
132
|
+
def test_name_can_be_overridden(self) -> None:
|
|
133
|
+
"""Tool name can be overridden."""
|
|
134
|
+
task = Task(scenario="investigate", args={})
|
|
135
|
+
tool = AgentTool(task, model="claude", name="custom_name")
|
|
136
|
+
|
|
137
|
+
assert tool.name == "custom_name"
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class TestAgentToolParamFiltering:
|
|
141
|
+
"""Tests for parameter filtering (eval-only params hidden)."""
|
|
142
|
+
|
|
143
|
+
def test_filters_eval_only_params(self) -> None:
|
|
144
|
+
"""Eval-only params (| None = None) are filtered from visible_params."""
|
|
145
|
+
env = Environment("test")
|
|
146
|
+
|
|
147
|
+
# Use Union syntax for consistency across Python versions
|
|
148
|
+
@env.scenario()
|
|
149
|
+
async def investigate(
|
|
150
|
+
issue_id: str,
|
|
151
|
+
include_traces: bool = True,
|
|
152
|
+
expected_cause: str | None = None, # Eval only
|
|
153
|
+
):
|
|
154
|
+
yield {"task": f"Investigate {issue_id}"}
|
|
155
|
+
|
|
156
|
+
task = env("investigate")
|
|
157
|
+
tool = AgentTool(task, model="claude")
|
|
158
|
+
|
|
159
|
+
# visible_params should only have issue_id and include_traces
|
|
160
|
+
assert "issue_id" in tool._visible_params
|
|
161
|
+
assert "include_traces" in tool._visible_params
|
|
162
|
+
assert "expected_cause" not in tool._visible_params
|
|
163
|
+
|
|
164
|
+
def test_all_required_params_visible(self) -> None:
|
|
165
|
+
"""All required params are visible."""
|
|
166
|
+
env = Environment("test")
|
|
167
|
+
|
|
168
|
+
@env.scenario()
|
|
169
|
+
async def search(query: str, limit: int):
|
|
170
|
+
yield {"task": f"Search: {query}"}
|
|
171
|
+
|
|
172
|
+
task = env("search")
|
|
173
|
+
tool = AgentTool(task, model="claude")
|
|
174
|
+
|
|
175
|
+
assert "query" in tool._visible_params
|
|
176
|
+
assert "limit" in tool._visible_params
|
|
177
|
+
|
|
178
|
+
def test_optional_with_default_visible(self) -> None:
|
|
179
|
+
"""Optional params with non-None defaults are visible."""
|
|
180
|
+
env = Environment("test")
|
|
181
|
+
|
|
182
|
+
@env.scenario()
|
|
183
|
+
async def fetch(url: str, request_timeout: int = 30, retries: int = 3):
|
|
184
|
+
yield {"task": f"Fetch {url}"}
|
|
185
|
+
|
|
186
|
+
task = env("fetch")
|
|
187
|
+
tool = AgentTool(task, model="claude")
|
|
188
|
+
|
|
189
|
+
assert "url" in tool._visible_params
|
|
190
|
+
assert "request_timeout" in tool._visible_params
|
|
191
|
+
assert "retries" in tool._visible_params
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
class TestAgentToolSchema:
|
|
195
|
+
"""Tests for JSON schema generation."""
|
|
196
|
+
|
|
197
|
+
def test_builds_json_schema(self) -> None:
|
|
198
|
+
"""Builds proper JSON schema from visible params."""
|
|
199
|
+
env = Environment("test")
|
|
200
|
+
|
|
201
|
+
@env.scenario()
|
|
202
|
+
async def investigate(issue_id: str, verbose: bool = False):
|
|
203
|
+
yield {"task": f"Investigate {issue_id}"}
|
|
204
|
+
|
|
205
|
+
task = env("investigate")
|
|
206
|
+
tool = AgentTool(task, model="claude")
|
|
207
|
+
|
|
208
|
+
schema = tool._param_schema
|
|
209
|
+
assert schema is not None
|
|
210
|
+
assert schema["type"] == "object"
|
|
211
|
+
assert "issue_id" in schema["properties"]
|
|
212
|
+
assert "verbose" in schema["properties"]
|
|
213
|
+
assert "issue_id" in schema["required"]
|
|
214
|
+
assert "verbose" not in schema["required"] # Has default
|
|
215
|
+
|
|
216
|
+
def test_schema_excludes_eval_only(self) -> None:
|
|
217
|
+
"""Schema excludes eval-only params."""
|
|
218
|
+
env = Environment("test")
|
|
219
|
+
|
|
220
|
+
@env.scenario()
|
|
221
|
+
async def check(
|
|
222
|
+
item_id: str,
|
|
223
|
+
expected_status: str | None = None, # Eval only
|
|
224
|
+
):
|
|
225
|
+
yield {"task": f"Check {item_id}"}
|
|
226
|
+
|
|
227
|
+
task = env("check")
|
|
228
|
+
tool = AgentTool(task, model="claude")
|
|
229
|
+
|
|
230
|
+
schema = tool._param_schema
|
|
231
|
+
assert schema is not None
|
|
232
|
+
assert "item_id" in schema["properties"]
|
|
233
|
+
assert "expected_status" not in schema["properties"]
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
class TestAgentToolMCP:
|
|
237
|
+
"""Tests for MCP tool integration."""
|
|
238
|
+
|
|
239
|
+
def test_mcp_property_returns_tool(self) -> None:
|
|
240
|
+
"""The mcp property returns a FastMCP FunctionTool."""
|
|
241
|
+
from fastmcp.tools import FunctionTool
|
|
242
|
+
|
|
243
|
+
env = Environment("test")
|
|
244
|
+
|
|
245
|
+
@env.scenario()
|
|
246
|
+
async def greet(name: str):
|
|
247
|
+
yield {"task": f"Greet {name}"}
|
|
248
|
+
|
|
249
|
+
task = env("greet")
|
|
250
|
+
tool = AgentTool(task, model="claude")
|
|
251
|
+
|
|
252
|
+
mcp_tool = tool.mcp
|
|
253
|
+
assert isinstance(mcp_tool, FunctionTool)
|
|
254
|
+
|
|
255
|
+
def test_mcp_has_filtered_parameters(self) -> None:
|
|
256
|
+
"""MCP tool has filtered parameter schema."""
|
|
257
|
+
env = Environment("test")
|
|
258
|
+
|
|
259
|
+
@env.scenario()
|
|
260
|
+
async def analyze(
|
|
261
|
+
data: str,
|
|
262
|
+
expected_result: str | None = None, # Eval only
|
|
263
|
+
):
|
|
264
|
+
yield {"task": f"Analyze {data}"}
|
|
265
|
+
|
|
266
|
+
task = env("analyze")
|
|
267
|
+
tool = AgentTool(task, model="claude")
|
|
268
|
+
|
|
269
|
+
mcp_tool = tool.mcp
|
|
270
|
+
params = mcp_tool.parameters # FunctionTool uses 'parameters'
|
|
271
|
+
|
|
272
|
+
assert "data" in params["properties"]
|
|
273
|
+
assert "expected_result" not in params["properties"]
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
class TestAgentToolCall:
|
|
277
|
+
"""Tests for AgentTool.__call__."""
|
|
278
|
+
|
|
279
|
+
@pytest.mark.asyncio
|
|
280
|
+
async def test_filters_kwargs_to_visible_only(self) -> None:
|
|
281
|
+
"""Call filters kwargs to visible params only."""
|
|
282
|
+
# Import modules first so patches work
|
|
283
|
+
import hud.agents
|
|
284
|
+
import hud.eval.manager # noqa: F401
|
|
285
|
+
|
|
286
|
+
env = Environment("test")
|
|
287
|
+
|
|
288
|
+
@env.scenario()
|
|
289
|
+
async def process(item: str, expected: str | None = None):
|
|
290
|
+
yield {"task": f"Process {item}"}
|
|
291
|
+
|
|
292
|
+
task = env("process")
|
|
293
|
+
tool = AgentTool(task, model="claude")
|
|
294
|
+
|
|
295
|
+
# Mock the eval context and agent
|
|
296
|
+
with (
|
|
297
|
+
patch("hud.eval.manager.run_eval") as mock_run_eval,
|
|
298
|
+
patch("hud.agents.create_agent") as mock_create_agent,
|
|
299
|
+
):
|
|
300
|
+
mock_ctx = AsyncMock()
|
|
301
|
+
mock_ctx.__aenter__ = AsyncMock(return_value=mock_ctx)
|
|
302
|
+
mock_ctx.__aexit__ = AsyncMock(return_value=None)
|
|
303
|
+
mock_run_eval.return_value = mock_ctx
|
|
304
|
+
|
|
305
|
+
mock_agent = MagicMock()
|
|
306
|
+
mock_agent.run = AsyncMock(return_value=MagicMock(content="result"))
|
|
307
|
+
mock_create_agent.return_value = mock_agent
|
|
308
|
+
|
|
309
|
+
# Call with both visible and eval-only params
|
|
310
|
+
await tool(item="test", expected="should_be_filtered")
|
|
311
|
+
|
|
312
|
+
# Check that task was created with filtered args
|
|
313
|
+
call_args = mock_run_eval.call_args
|
|
314
|
+
task_arg = call_args[0][0]
|
|
315
|
+
assert "item" in task_arg.args
|
|
316
|
+
assert "expected" not in task_arg.args # Filtered out
|
|
317
|
+
|
|
318
|
+
@pytest.mark.asyncio
|
|
319
|
+
async def test_merges_template_args(self) -> None:
|
|
320
|
+
"""Call merges kwargs with template args."""
|
|
321
|
+
# Import modules first so patches work
|
|
322
|
+
import hud.agents
|
|
323
|
+
import hud.eval.manager # noqa: F401
|
|
324
|
+
|
|
325
|
+
env = Environment("test")
|
|
326
|
+
|
|
327
|
+
@env.scenario()
|
|
328
|
+
async def search(query: str, limit: int = 10):
|
|
329
|
+
yield {"task": f"Search {query}"}
|
|
330
|
+
|
|
331
|
+
# Create template with some args pre-filled
|
|
332
|
+
task = env("search", limit=5)
|
|
333
|
+
tool = AgentTool(task, model="claude")
|
|
334
|
+
|
|
335
|
+
with (
|
|
336
|
+
patch("hud.eval.manager.run_eval") as mock_run_eval,
|
|
337
|
+
patch("hud.agents.create_agent") as mock_create_agent,
|
|
338
|
+
):
|
|
339
|
+
mock_ctx = AsyncMock()
|
|
340
|
+
mock_ctx.__aenter__ = AsyncMock(return_value=mock_ctx)
|
|
341
|
+
mock_ctx.__aexit__ = AsyncMock(return_value=None)
|
|
342
|
+
mock_run_eval.return_value = mock_ctx
|
|
343
|
+
|
|
344
|
+
mock_agent = MagicMock()
|
|
345
|
+
mock_agent.run = AsyncMock(return_value=MagicMock(content="result"))
|
|
346
|
+
mock_create_agent.return_value = mock_agent
|
|
347
|
+
|
|
348
|
+
# Call with additional arg
|
|
349
|
+
await tool(query="test query")
|
|
350
|
+
|
|
351
|
+
# Check merged args
|
|
352
|
+
call_args = mock_run_eval.call_args
|
|
353
|
+
task_arg = call_args[0][0]
|
|
354
|
+
assert task_arg.args["query"] == "test query"
|
|
355
|
+
assert task_arg.args["limit"] == 5 # From template
|
hud/types.py
CHANGED
|
@@ -8,7 +8,7 @@ from typing import Any, Literal
|
|
|
8
8
|
|
|
9
9
|
import mcp.types as types
|
|
10
10
|
from mcp.types import CallToolRequestParams, CallToolResult
|
|
11
|
-
from pydantic import
|
|
11
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
12
12
|
|
|
13
13
|
from hud.settings import settings
|
|
14
14
|
from hud.utils.env import resolve_env_vars as _resolve_env_vars
|
|
@@ -31,59 +31,87 @@ class AgentType(str, Enum):
|
|
|
31
31
|
|
|
32
32
|
@property
|
|
33
33
|
def cls(self) -> type:
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
from hud.agents.gemini import GeminiAgent
|
|
37
|
-
from hud.agents.gemini_cua import GeminiCUAAgent
|
|
38
|
-
from hud.agents.openai_chat import OpenAIChatAgent
|
|
34
|
+
if self == AgentType.CLAUDE:
|
|
35
|
+
from hud.agents.claude import ClaudeAgent
|
|
39
36
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
37
|
+
return ClaudeAgent
|
|
38
|
+
elif self == AgentType.OPENAI:
|
|
39
|
+
from hud.agents import OpenAIAgent
|
|
40
|
+
|
|
41
|
+
return OpenAIAgent
|
|
42
|
+
elif self == AgentType.OPERATOR:
|
|
43
|
+
from hud.agents import OperatorAgent
|
|
44
|
+
|
|
45
|
+
return OperatorAgent
|
|
46
|
+
elif self == AgentType.GEMINI:
|
|
47
|
+
from hud.agents.gemini import GeminiAgent
|
|
48
|
+
|
|
49
|
+
return GeminiAgent
|
|
50
|
+
elif self == AgentType.GEMINI_CUA:
|
|
51
|
+
from hud.agents.gemini_cua import GeminiCUAAgent
|
|
52
|
+
|
|
53
|
+
return GeminiCUAAgent
|
|
54
|
+
elif self == AgentType.OPENAI_COMPATIBLE:
|
|
55
|
+
from hud.agents.openai_chat import OpenAIChatAgent
|
|
56
|
+
|
|
57
|
+
return OpenAIChatAgent
|
|
58
|
+
elif self == AgentType.INTEGRATION_TEST:
|
|
49
59
|
from hud.agents.misc.integration_test_agent import IntegrationTestRunner
|
|
50
60
|
|
|
51
61
|
return IntegrationTestRunner
|
|
52
|
-
|
|
62
|
+
else:
|
|
53
63
|
raise ValueError(f"Unsupported agent type: {self}")
|
|
64
|
+
|
|
65
|
+
@property
|
|
66
|
+
def config_cls(self) -> type:
|
|
67
|
+
"""Get config class without importing agent (avoids SDK dependency)."""
|
|
68
|
+
from hud.agents.types import (
|
|
69
|
+
ClaudeConfig,
|
|
70
|
+
GeminiConfig,
|
|
71
|
+
GeminiCUAConfig,
|
|
72
|
+
OpenAIChatConfig,
|
|
73
|
+
OpenAIConfig,
|
|
74
|
+
OperatorConfig,
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
mapping: dict[AgentType, type] = {
|
|
78
|
+
AgentType.CLAUDE: ClaudeConfig,
|
|
79
|
+
AgentType.OPENAI: OpenAIConfig,
|
|
80
|
+
AgentType.OPERATOR: OperatorConfig,
|
|
81
|
+
AgentType.GEMINI: GeminiConfig,
|
|
82
|
+
AgentType.GEMINI_CUA: GeminiCUAConfig,
|
|
83
|
+
AgentType.OPENAI_COMPATIBLE: OpenAIChatConfig,
|
|
84
|
+
AgentType.INTEGRATION_TEST: BaseAgentConfig,
|
|
85
|
+
}
|
|
86
|
+
if self not in mapping:
|
|
87
|
+
raise ValueError(f"Unsupported agent type for config: {self}")
|
|
54
88
|
return mapping[self]
|
|
55
89
|
|
|
56
90
|
|
|
57
91
|
class BaseAgentConfig(BaseModel):
|
|
58
92
|
"""Agent configuration for LLM-specific settings.
|
|
59
93
|
|
|
60
|
-
Note: allowed_tools, disallowed_tools, append_setup_output,
|
|
61
|
-
are kept for backwards compatibility with v4 task configs
|
|
62
|
-
at the agent level. These should be configured on the
|
|
94
|
+
Note: allowed_tools, disallowed_tools, response_tool_name, append_setup_output,
|
|
95
|
+
and initial_screenshot are kept for backwards compatibility with v4 task configs
|
|
96
|
+
but are no longer applied at the agent level. These should be configured on the
|
|
97
|
+
Environment/Task instead.
|
|
63
98
|
"""
|
|
64
99
|
|
|
65
100
|
model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid", populate_by_name=True)
|
|
66
101
|
|
|
67
|
-
# Model identifier - use 'model' (preferred) or 'checkpoint_name' (alias)
|
|
68
|
-
model: str | None = Field(
|
|
69
|
-
default=None, validation_alias=AliasChoices("model", "checkpoint_name")
|
|
70
|
-
)
|
|
71
|
-
model_name: str = "Agent" # Human-readable display name
|
|
72
|
-
|
|
73
102
|
# LLM-specific setting
|
|
74
103
|
system_prompt: str | None = None
|
|
75
104
|
|
|
76
|
-
# Deprecated: kept for backwards compat with v4 task configs
|
|
105
|
+
# Deprecated: kept for backwards compat with v4 task configs
|
|
106
|
+
# allowed_tools/disallowed_tools are applied at Environment level
|
|
107
|
+
# append_setup_output is applied by EvalContext -> agent
|
|
108
|
+
# response_tool_name and initial_screenshot are parsed but NOT implemented
|
|
77
109
|
allowed_tools: list[str] | None = None
|
|
78
110
|
disallowed_tools: list[str] | None = None
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
@property
|
|
84
|
-
def checkpoint_name(self) -> str | None:
|
|
85
|
-
"""Alias for model (for backwards compatibility)."""
|
|
86
|
-
return self.model
|
|
111
|
+
response_tool_name: str | None = None # Not implemented
|
|
112
|
+
append_setup_output: bool = False
|
|
113
|
+
append_setup_tool: bool = False # Alias for append_setup_output
|
|
114
|
+
initial_screenshot: bool = False # Not implemented
|
|
87
115
|
|
|
88
116
|
|
|
89
117
|
class LegacyTask(BaseModel):
|
hud/utils/hud_console.py
CHANGED
|
@@ -21,6 +21,7 @@ import traceback
|
|
|
21
21
|
from typing import TYPE_CHECKING, Any, Literal, Self
|
|
22
22
|
|
|
23
23
|
from rich.console import Console
|
|
24
|
+
from rich.markup import escape
|
|
24
25
|
from rich.panel import Panel
|
|
25
26
|
from rich.table import Table
|
|
26
27
|
|
|
@@ -95,7 +96,7 @@ class HUDConsole:
|
|
|
95
96
|
stderr: If True, output to stderr (default), otherwise stdout
|
|
96
97
|
"""
|
|
97
98
|
console = self._stderr_console if stderr else self._stdout_console
|
|
98
|
-
console.print(f"[{GREEN}]✅ {message}[/{GREEN}]")
|
|
99
|
+
console.print(f"[{GREEN}]✅ {escape(message)}[/{GREEN}]")
|
|
99
100
|
|
|
100
101
|
def error(self, message: str, stderr: bool = True) -> None:
|
|
101
102
|
"""Print an error message.
|
|
@@ -106,10 +107,12 @@ class HUDConsole:
|
|
|
106
107
|
"""
|
|
107
108
|
console = self._stderr_console if stderr else self._stdout_console
|
|
108
109
|
tb = traceback.format_exc()
|
|
110
|
+
escaped_message = escape(message)
|
|
109
111
|
if "NoneType: None" not in tb:
|
|
110
|
-
|
|
112
|
+
escaped_tb = escape(tb)
|
|
113
|
+
console.print(f"[{RED} not bold]❌ {escaped_message}\n{escaped_tb}[/{RED} not bold]")
|
|
111
114
|
else:
|
|
112
|
-
console.print(f"[{RED} not bold]❌ {
|
|
115
|
+
console.print(f"[{RED} not bold]❌ {escaped_message}[/{RED} not bold]")
|
|
113
116
|
|
|
114
117
|
def warning(self, message: str, stderr: bool = True) -> None:
|
|
115
118
|
"""Print a warning message.
|
|
@@ -119,7 +122,7 @@ class HUDConsole:
|
|
|
119
122
|
stderr: If True, output to stderr (default), otherwise stdout
|
|
120
123
|
"""
|
|
121
124
|
console = self._stderr_console if stderr else self._stdout_console
|
|
122
|
-
console.print(f"⚠️ [{YELLOW} not bold]{message}[/{YELLOW} not bold]")
|
|
125
|
+
console.print(f"⚠️ [{YELLOW} not bold]{escape(message)}[/{YELLOW} not bold]")
|
|
123
126
|
|
|
124
127
|
def info(self, message: str, stderr: bool = True) -> None:
|
|
125
128
|
"""Print an info message.
|
|
@@ -129,7 +132,7 @@ class HUDConsole:
|
|
|
129
132
|
stderr: If True, output to stderr (default), otherwise stdout
|
|
130
133
|
"""
|
|
131
134
|
console = self._stderr_console if stderr else self._stdout_console
|
|
132
|
-
console.print(f"[{TEXT} not bold]{message}[/{TEXT} not bold]")
|
|
135
|
+
console.print(f"[{TEXT} not bold]{escape(message)}[/{TEXT} not bold]")
|
|
133
136
|
|
|
134
137
|
def print(self, message: str, stderr: bool = True) -> None:
|
|
135
138
|
"""Print a message.
|
|
@@ -151,7 +154,7 @@ class HUDConsole:
|
|
|
151
154
|
"""
|
|
152
155
|
console = self._stderr_console if stderr else self._stdout_console
|
|
153
156
|
console.print(
|
|
154
|
-
f"[{DIM} not bold][default]{label}[/default][/{DIM} not bold] [default]{value}[/default]" # noqa: E501
|
|
157
|
+
f"[{DIM} not bold][default]{escape(label)}[/default][/{DIM} not bold] [default]{escape(value)}[/default]" # noqa: E501
|
|
155
158
|
)
|
|
156
159
|
|
|
157
160
|
def link(self, url: str, stderr: bool = True) -> None:
|
|
@@ -162,7 +165,7 @@ class HUDConsole:
|
|
|
162
165
|
stderr: If True, output to stderr (default), otherwise stdout
|
|
163
166
|
"""
|
|
164
167
|
console = self._stderr_console if stderr else self._stdout_console
|
|
165
|
-
console.print(f"[{SECONDARY} underline]{url}[/{SECONDARY} underline]")
|
|
168
|
+
console.print(f"[{SECONDARY} underline]{escape(url)}[/{SECONDARY} underline]")
|
|
166
169
|
|
|
167
170
|
def json_config(self, json_str: str, stderr: bool = True) -> None:
|
|
168
171
|
"""Print JSON configuration with neutral theme.
|
|
@@ -173,7 +176,7 @@ class HUDConsole:
|
|
|
173
176
|
"""
|
|
174
177
|
# Print JSON with neutral grey text
|
|
175
178
|
console = self._stderr_console if stderr else self._stdout_console
|
|
176
|
-
console.print(f"[{TEXT}]{json_str}[/{TEXT}]")
|
|
179
|
+
console.print(f"[{TEXT}]{escape(json_str)}[/{TEXT}]")
|
|
177
180
|
|
|
178
181
|
def key_value_table(
|
|
179
182
|
self, data: dict[str, str | int | float], show_header: bool = False, stderr: bool = True
|
|
@@ -203,7 +206,7 @@ class HUDConsole:
|
|
|
203
206
|
stderr: If True, output to stderr (default), otherwise stdout
|
|
204
207
|
"""
|
|
205
208
|
console = self._stderr_console if stderr else self._stdout_console
|
|
206
|
-
console.print(f"[{DIM}]{message}[/{DIM}]")
|
|
209
|
+
console.print(f"[{DIM}]{escape(message)}[/{DIM}]")
|
|
207
210
|
|
|
208
211
|
def phase(self, phase_num: int, title: str, stderr: bool = True) -> None:
|
|
209
212
|
"""Print a phase header (for debug command).
|
|
@@ -236,7 +239,7 @@ class HUDConsole:
|
|
|
236
239
|
stderr: If True, output to stderr (default), otherwise stdout
|
|
237
240
|
"""
|
|
238
241
|
console = self._stderr_console if stderr else self._stdout_console
|
|
239
|
-
console.print(f"[rgb(181,137,0)]💡 Hint: {hint}[/rgb(181,137,0)]")
|
|
242
|
+
console.print(f"[rgb(181,137,0)]💡 Hint: {escape(hint)}[/rgb(181,137,0)]")
|
|
240
243
|
|
|
241
244
|
def status_item(
|
|
242
245
|
self,
|
|
@@ -265,10 +268,14 @@ class HUDConsole:
|
|
|
265
268
|
indicator = indicators.get(status, indicators["info"])
|
|
266
269
|
console = self._stderr_console if stderr else self._stdout_console
|
|
267
270
|
|
|
271
|
+
escaped_label = escape(label)
|
|
272
|
+
escaped_value = escape(value)
|
|
268
273
|
if primary:
|
|
269
|
-
console.print(
|
|
274
|
+
console.print(
|
|
275
|
+
f"{indicator} {escaped_label}: [bold {SECONDARY}]{escaped_value}[/bold {SECONDARY}]"
|
|
276
|
+
)
|
|
270
277
|
else:
|
|
271
|
-
console.print(f"{indicator} {
|
|
278
|
+
console.print(f"{indicator} {escaped_label}: [{TEXT}]{escaped_value}[/{TEXT}]")
|
|
272
279
|
|
|
273
280
|
def command_example(
|
|
274
281
|
self, command: str, description: str | None = None, stderr: bool = True
|
|
@@ -546,7 +553,12 @@ class HUDConsole:
|
|
|
546
553
|
except (TypeError, ValueError):
|
|
547
554
|
args_str = str(arguments)[:60]
|
|
548
555
|
|
|
549
|
-
|
|
556
|
+
escaped_name = escape(name)
|
|
557
|
+
escaped_args = escape(args_str)
|
|
558
|
+
return (
|
|
559
|
+
f"[{GOLD}]→[/{GOLD}] [bold {TEXT}]{escaped_name}[/bold {TEXT}]"
|
|
560
|
+
f"[{DIM}]({escaped_args})[/{DIM}]"
|
|
561
|
+
)
|
|
550
562
|
|
|
551
563
|
def format_tool_result(self, content: str, is_error: bool = False) -> str:
|
|
552
564
|
"""Format a tool result in compact HUD style.
|
|
@@ -562,11 +574,12 @@ class HUDConsole:
|
|
|
562
574
|
if len(content) > 80:
|
|
563
575
|
content = content[:77] + "..."
|
|
564
576
|
|
|
577
|
+
escaped_content = escape(content)
|
|
565
578
|
# Format with status using HUD colors
|
|
566
579
|
if is_error:
|
|
567
|
-
return f" [{RED}]✗[/{RED}] [{DIM}]{
|
|
580
|
+
return f" [{RED}]✗[/{RED}] [{DIM}]{escaped_content}[/{DIM}]"
|
|
568
581
|
else:
|
|
569
|
-
return f" [{GREEN}]✓[/{GREEN}] [{TEXT}]{
|
|
582
|
+
return f" [{GREEN}]✓[/{GREEN}] [{TEXT}]{escaped_content}[/{TEXT}]"
|
|
570
583
|
|
|
571
584
|
def confirm(self, message: str, default: bool = True) -> bool:
|
|
572
585
|
"""Print a confirmation message.
|
|
@@ -590,12 +603,12 @@ class HUDConsole:
|
|
|
590
603
|
stderr: If True, output to stderr
|
|
591
604
|
"""
|
|
592
605
|
console = self._stderr_console if stderr else self._stdout_console
|
|
593
|
-
console.print(f"[{color}]{symbol}[/{color}] {message}")
|
|
606
|
+
console.print(f"[{color}]{symbol}[/{color}] {escape(message)}")
|
|
594
607
|
|
|
595
608
|
def detail(self, message: str, stderr: bool = True) -> None:
|
|
596
609
|
"""Print an indented detail line with gold pointer symbol."""
|
|
597
610
|
console = self._stderr_console if stderr else self._stdout_console
|
|
598
|
-
console.print(f" [{GOLD}]{Symbols.ITEM}[/{GOLD}] {message}")
|
|
611
|
+
console.print(f" [{GOLD}]{Symbols.ITEM}[/{GOLD}] {escape(message)}")
|
|
599
612
|
|
|
600
613
|
def flow(self, message: str, stderr: bool = True) -> None:
|
|
601
614
|
"""Print a flow/transition message with wave symbol."""
|
hud/utils/strict_schema.py
CHANGED
|
@@ -118,7 +118,7 @@ def _ensure_strict_json_schema(
|
|
|
118
118
|
if "default" in json_schema:
|
|
119
119
|
json_schema.pop("default")
|
|
120
120
|
|
|
121
|
-
for keyword in ("title", "examples"):
|
|
121
|
+
for keyword in ("title", "examples", "format"):
|
|
122
122
|
json_schema.pop(keyword, None)
|
|
123
123
|
|
|
124
124
|
ref = json_schema.get("$ref")
|
hud/utils/tests/test_version.py
CHANGED
hud/version.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hud-python
|
|
3
|
-
Version: 0.5.
|
|
3
|
+
Version: 0.5.13
|
|
4
4
|
Summary: SDK for the HUD platform.
|
|
5
5
|
Project-URL: Homepage, https://github.com/hud-evals/hud-python
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
|
|
@@ -91,7 +91,7 @@ Requires-Dist: pyright==1.1.407; extra == 'dev'
|
|
|
91
91
|
Requires-Dist: pytest-asyncio; extra == 'dev'
|
|
92
92
|
Requires-Dist: pytest-cov; extra == 'dev'
|
|
93
93
|
Requires-Dist: pytest-mock; extra == 'dev'
|
|
94
|
-
Requires-Dist: pytest
|
|
94
|
+
Requires-Dist: pytest>=8.1.1; extra == 'dev'
|
|
95
95
|
Requires-Dist: ruff>=0.11.8; extra == 'dev'
|
|
96
96
|
Requires-Dist: tornado>=6.5.2; extra == 'dev'
|
|
97
97
|
Description-Content-Type: text/markdown
|