hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/agents/grounded_openai.py
CHANGED
|
@@ -3,96 +3,91 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
|
-
from typing import Any, ClassVar
|
|
6
|
+
from typing import TYPE_CHECKING, Any, ClassVar
|
|
7
|
+
|
|
8
|
+
from pydantic import ConfigDict, field_validator
|
|
7
9
|
|
|
8
|
-
from hud import instrument
|
|
9
10
|
from hud.tools.grounding import GroundedComputerTool, Grounder, GrounderConfig
|
|
10
11
|
from hud.types import AgentResponse, MCPToolCall, MCPToolResult
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
-
|
|
27
|
-
"""
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
"""Initialize the agent and create the grounded tool with mcp_client."""
|
|
89
|
-
# Call parent initialization first
|
|
90
|
-
await super().initialize(task)
|
|
91
|
-
|
|
92
|
-
if self.mcp_client is None:
|
|
93
|
-
raise ValueError("mcp_client must be initialized before creating grounded tool")
|
|
12
|
+
from hud.utils.types import with_signature
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from hud.types import BaseAgentConfig
|
|
16
|
+
from .base import BaseCreateParams
|
|
17
|
+
from .openai_chat import OpenAIChatAgent, OpenAIChatConfig
|
|
18
|
+
|
|
19
|
+
DEFAULT_GROUNDED_PROMPT = (
|
|
20
|
+
"You are a helpful AI assistant that can control the computer through visual "
|
|
21
|
+
"interaction.\n\n"
|
|
22
|
+
"IMPORTANT: Always explain your reasoning and observations before taking actions:\n"
|
|
23
|
+
"1. First, describe what you see on the screen.\n"
|
|
24
|
+
"2. Explain what you plan to do and why.\n"
|
|
25
|
+
"3. Then use the computer tool with natural language descriptions.\n\n"
|
|
26
|
+
"Use descriptive element descriptions:\n"
|
|
27
|
+
'- Colors ("red button", "blue link")\n'
|
|
28
|
+
'- Position ("top right corner", "left sidebar")\n'
|
|
29
|
+
'- Text content ("Submit button", "Login link")\n'
|
|
30
|
+
'- Element type ("text field", "dropdown")'
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class GroundedOpenAIConfig(OpenAIChatConfig):
|
|
35
|
+
"""Configuration for grounded OpenAI chat agent."""
|
|
36
|
+
|
|
37
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
38
|
+
|
|
39
|
+
grounder_config: GrounderConfig
|
|
40
|
+
model: str = "gpt-4o-mini"
|
|
41
|
+
allowed_tools: list[str] | None = None # Default set in validator
|
|
42
|
+
append_setup_output: bool = False
|
|
43
|
+
system_prompt: str | None = DEFAULT_GROUNDED_PROMPT
|
|
44
|
+
|
|
45
|
+
@field_validator("grounder_config", mode="before")
|
|
46
|
+
@classmethod
|
|
47
|
+
def _coerce_grounder_config(cls, value: GrounderConfig | dict[str, Any]) -> GrounderConfig:
|
|
48
|
+
if isinstance(value, GrounderConfig):
|
|
49
|
+
return value
|
|
50
|
+
if isinstance(value, dict):
|
|
51
|
+
return GrounderConfig(**value)
|
|
52
|
+
|
|
53
|
+
@field_validator("allowed_tools", mode="before")
|
|
54
|
+
@classmethod
|
|
55
|
+
def _default_allowed_tools(cls, value: list[str] | None) -> list[str] | None:
|
|
56
|
+
if value is None:
|
|
57
|
+
return ["computer"]
|
|
58
|
+
return value
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class GroundedOpenAICreateParams(BaseCreateParams, GroundedOpenAIConfig):
|
|
62
|
+
pass
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class GroundedOpenAIChatAgent(OpenAIChatAgent):
|
|
66
|
+
"""OpenAI chat agent that pipes 'computer' tool calls through a vision grounder."""
|
|
67
|
+
|
|
68
|
+
metadata: ClassVar[dict[str, Any] | None] = None
|
|
69
|
+
config_cls: ClassVar[type[BaseAgentConfig]] = GroundedOpenAIConfig
|
|
70
|
+
|
|
71
|
+
@with_signature(GroundedOpenAICreateParams)
|
|
72
|
+
@classmethod
|
|
73
|
+
def create(cls, **kwargs: Any) -> GroundedOpenAIChatAgent: # pyright: ignore[reportIncompatibleMethodOverride]
|
|
74
|
+
from .base import MCPAgent
|
|
75
|
+
|
|
76
|
+
return MCPAgent.create.__func__(cls, **kwargs) # type: ignore[return-value]
|
|
77
|
+
|
|
78
|
+
def __init__(self, params: GroundedOpenAICreateParams | None = None, **kwargs: Any) -> None:
|
|
79
|
+
super().__init__(params, **kwargs) # type: ignore[arg-type]
|
|
80
|
+
self.config: GroundedOpenAIConfig # type: ignore[assignment]
|
|
81
|
+
|
|
82
|
+
self.grounder = Grounder(self.config.grounder_config)
|
|
83
|
+
self.grounded_tool: GroundedComputerTool | None = None
|
|
84
|
+
|
|
85
|
+
def _on_tools_ready(self) -> None:
|
|
86
|
+
"""Create the grounded tool after context is bound."""
|
|
87
|
+
if self.ctx is None:
|
|
88
|
+
raise ValueError("ctx must be set before creating grounded tool")
|
|
94
89
|
self.grounded_tool = GroundedComputerTool(
|
|
95
|
-
grounder=self.grounder,
|
|
90
|
+
grounder=self.grounder, ctx=self.ctx, computer_tool_name="computer"
|
|
96
91
|
)
|
|
97
92
|
|
|
98
93
|
def get_tool_schemas(self) -> list[Any]:
|
|
@@ -108,11 +103,6 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
|
|
|
108
103
|
return []
|
|
109
104
|
return [self.grounded_tool.get_openai_tool_schema()]
|
|
110
105
|
|
|
111
|
-
@instrument(
|
|
112
|
-
span_type="agent",
|
|
113
|
-
record_args=False,
|
|
114
|
-
record_result=True,
|
|
115
|
-
)
|
|
116
106
|
async def get_response(self, messages: Any) -> AgentResponse:
|
|
117
107
|
"""Get response from the planning model and handle grounded tool calls.
|
|
118
108
|
|
|
@@ -142,11 +132,9 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
|
|
|
142
132
|
)
|
|
143
133
|
|
|
144
134
|
if not has_image:
|
|
145
|
-
if self.
|
|
146
|
-
raise ValueError("
|
|
147
|
-
screenshot_result = await self.
|
|
148
|
-
MCPToolCall(name="computer", arguments={"action": "screenshot"})
|
|
149
|
-
)
|
|
135
|
+
if self.ctx is None:
|
|
136
|
+
raise ValueError("ctx is not initialized")
|
|
137
|
+
screenshot_result = await self.ctx.call_tool(("computer", {"action": "screenshot"}))
|
|
150
138
|
|
|
151
139
|
for block in screenshot_result.content:
|
|
152
140
|
# Check for ImageContent type from MCP
|
|
@@ -169,8 +157,8 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
|
|
|
169
157
|
protected_keys = {"model", "messages", "tools", "parallel_tool_calls"}
|
|
170
158
|
extra = {k: v for k, v in (self.completion_kwargs or {}).items() if k not in protected_keys}
|
|
171
159
|
|
|
172
|
-
response = await self.oai.chat.completions.create(
|
|
173
|
-
model=self.
|
|
160
|
+
response = await self.oai.chat.completions.create( # type: ignore
|
|
161
|
+
model=self.config.model,
|
|
174
162
|
messages=messages,
|
|
175
163
|
tools=tool_schemas,
|
|
176
164
|
parallel_tool_calls=False,
|
|
@@ -193,6 +181,7 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
|
|
|
193
181
|
if not msg.tool_calls:
|
|
194
182
|
return AgentResponse(
|
|
195
183
|
content=msg.content or "",
|
|
184
|
+
reasoning=msg.reasoning_content,
|
|
196
185
|
tool_calls=[],
|
|
197
186
|
done=choice.finish_reason in ("stop", "length"),
|
|
198
187
|
raw=response,
|
|
@@ -203,6 +192,7 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
|
|
|
203
192
|
if tc.function.name != "computer":
|
|
204
193
|
return AgentResponse(
|
|
205
194
|
content=f"Error: Model called unexpected tool '{tc.function.name}'",
|
|
195
|
+
reasoning=msg.reasoning_content,
|
|
206
196
|
tool_calls=[],
|
|
207
197
|
done=True,
|
|
208
198
|
raw=response,
|
|
@@ -213,13 +203,21 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
|
|
|
213
203
|
args = json.loads(tc.function.arguments or "{}")
|
|
214
204
|
except json.JSONDecodeError:
|
|
215
205
|
return AgentResponse(
|
|
216
|
-
content="Error: Invalid tool arguments",
|
|
206
|
+
content="Error: Invalid tool arguments",
|
|
207
|
+
reasoning=msg.reasoning_content,
|
|
208
|
+
tool_calls=[],
|
|
209
|
+
done=True,
|
|
210
|
+
raw=response,
|
|
217
211
|
)
|
|
218
212
|
|
|
219
213
|
tool_call = MCPToolCall(name="computer", arguments=args, id=tc.id)
|
|
220
214
|
|
|
221
215
|
return AgentResponse(
|
|
222
|
-
content=msg.content or "",
|
|
216
|
+
content=msg.content or "",
|
|
217
|
+
reasoning=msg.reasoning_content,
|
|
218
|
+
tool_calls=[tool_call],
|
|
219
|
+
done=False,
|
|
220
|
+
raw=response,
|
|
223
221
|
)
|
|
224
222
|
|
|
225
223
|
async def call_tools(
|
|
hud/agents/misc/integration_test_agent.py
CHANGED
|
@@ -1,41 +1,72 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import Any
|
|
3
|
+
from typing import TYPE_CHECKING, Any, ClassVar
|
|
4
4
|
|
|
5
|
-
from hud.agents.base import MCPAgent
|
|
6
|
-
from hud.types import AgentResponse,
|
|
5
|
+
from hud.agents.base import MCPAgent
|
|
6
|
+
from hud.types import AgentResponse, BaseAgentConfig, Trace
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from hud.eval.context import EvalContext
|
|
7
10
|
|
|
8
11
|
|
|
9
12
|
class IntegrationTestRunner(MCPAgent):
|
|
13
|
+
"""Special agent that runs integration tests by executing tools directly.
|
|
14
|
+
|
|
15
|
+
Unlike regular agents, this doesn't run an LLM loop - it executes
|
|
16
|
+
integration_test_tool and evaluate_tool in sequence to verify tool behavior.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
metadata: ClassVar[dict[str, Any] | None] = {}
|
|
20
|
+
config_cls: ClassVar[type[BaseAgentConfig]] = BaseAgentConfig
|
|
21
|
+
|
|
10
22
|
def __init__(self, **kwargs: Any) -> None:
|
|
11
23
|
kwargs["auto_trace"] = False
|
|
12
24
|
super().__init__(**kwargs)
|
|
13
|
-
self.metadata = {}
|
|
14
25
|
|
|
15
|
-
async def run(
|
|
26
|
+
async def run(
|
|
27
|
+
self,
|
|
28
|
+
ctx: EvalContext,
|
|
29
|
+
*,
|
|
30
|
+
max_steps: int = 10,
|
|
31
|
+
) -> Trace:
|
|
32
|
+
"""Run integration test by executing tools directly.
|
|
33
|
+
|
|
34
|
+
The EvalContext should have integration_test_tool and evaluate_tool
|
|
35
|
+
configured in its metadata or environment setup.
|
|
36
|
+
"""
|
|
37
|
+
from hud.eval.context import EvalContext
|
|
38
|
+
|
|
39
|
+
if not isinstance(ctx, EvalContext):
|
|
40
|
+
raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
|
|
41
|
+
|
|
42
|
+
self.ctx = ctx
|
|
43
|
+
|
|
16
44
|
try:
|
|
17
|
-
# Initialize
|
|
18
|
-
|
|
45
|
+
# Initialize tools from context
|
|
46
|
+
if not self._initialized:
|
|
47
|
+
await self._initialize_from_ctx(ctx)
|
|
19
48
|
|
|
20
|
-
|
|
21
|
-
|
|
49
|
+
self.console.info(f"Full system prompt: {self.system_prompt}")
|
|
50
|
+
|
|
51
|
+
# For integration tests, we expect the context's environment to have
|
|
52
|
+
# _setup_calls, _integration_test_calls, and _evaluate_calls configured
|
|
53
|
+
env = ctx
|
|
54
|
+
|
|
55
|
+
# Run integration test tool (stored in environment metadata or separate list)
|
|
56
|
+
integration_test_calls = getattr(env, "_integration_test_calls", [])
|
|
57
|
+
if not integration_test_calls:
|
|
22
58
|
raise ValueError(
|
|
23
|
-
"--integration-test requires
|
|
59
|
+
"--integration-test requires integration_test_tool to be configured"
|
|
24
60
|
)
|
|
25
|
-
elif not getattr(task, "evaluate_tool", None):
|
|
26
|
-
raise ValueError("--integration-test requires task.evaluate_tool (single call)")
|
|
27
|
-
|
|
28
|
-
if task.setup_tool:
|
|
29
|
-
_ = await self.call_tools(task.setup_tool)
|
|
30
61
|
|
|
31
|
-
|
|
32
|
-
|
|
62
|
+
for name, args in integration_test_calls:
|
|
63
|
+
await ctx.call_tool((name, args))
|
|
33
64
|
|
|
34
|
-
|
|
65
|
+
# The evaluate phase runs automatically when ctx exits,
|
|
66
|
+
# but we can also get the reward from ctx.reward after
|
|
67
|
+
return Trace(done=True, reward=ctx.reward or 0.0, info={})
|
|
35
68
|
|
|
36
|
-
return Trace(done=True, reward=reward, info={})
|
|
37
69
|
finally:
|
|
38
|
-
# Ensure resources are cleaned up so the CLI can exit cleanly
|
|
39
70
|
await self._cleanup()
|
|
40
71
|
|
|
41
72
|
# Stub implementations to satisfy abstract base class; not used in --integration-test path
|
|
hud/agents/misc/response_agent.py
CHANGED
|
@@ -1,14 +1,37 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
import
|
|
3
|
+
import logging
|
|
4
4
|
from typing import Literal
|
|
5
5
|
|
|
6
6
|
from openai import AsyncOpenAI
|
|
7
7
|
|
|
8
8
|
from hud.settings import settings
|
|
9
|
+
from hud.telemetry import instrument
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
9
12
|
|
|
10
13
|
ResponseType = Literal["STOP", "CONTINUE"]
|
|
11
14
|
|
|
15
|
+
DEFAULT_SYSTEM_PROMPT = """\
|
|
16
|
+
You are an assistant that helps determine the appropriate response to an agent's message.
|
|
17
|
+
|
|
18
|
+
You will receive messages from an agent that is performing tasks for a user.
|
|
19
|
+
Your job is to analyze these messages and respond with one of the following:
|
|
20
|
+
|
|
21
|
+
- STOP: If the agent indicates it has successfully completed a task or is stuck,
|
|
22
|
+
struggling or says it cannot complete the task, even if phrased as a question
|
|
23
|
+
like "I have entered the right values into this form. Would you like me to do
|
|
24
|
+
anything else?" or "Here is the website. Is there any other information you
|
|
25
|
+
need?" or if the agent has strongly determined it wants to stop the task like
|
|
26
|
+
"The task is infeasible. Can I help you with something else?"
|
|
27
|
+
|
|
28
|
+
- CONTINUE: If the agent is asking for clarification before proceeding with a task
|
|
29
|
+
like "I'm about to clear cookies from this website. Would you like me to proceed?"
|
|
30
|
+
or "I've entered the right values into this form. Would you like me to continue
|
|
31
|
+
with the rest of the task?"
|
|
32
|
+
|
|
33
|
+
Respond ONLY with one of these two options."""
|
|
34
|
+
|
|
12
35
|
|
|
13
36
|
class ResponseAgent:
|
|
14
37
|
"""
|
|
@@ -17,49 +40,36 @@ class ResponseAgent:
|
|
|
17
40
|
"""
|
|
18
41
|
|
|
19
42
|
def __init__(
|
|
20
|
-
self,
|
|
43
|
+
self,
|
|
44
|
+
model: str = "gpt-4o",
|
|
45
|
+
system_prompt: str | None = None,
|
|
21
46
|
) -> None:
|
|
22
47
|
"""
|
|
23
48
|
Initialize the ResponseAgent.
|
|
24
49
|
|
|
25
50
|
Args:
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
system_prompt:
|
|
51
|
+
model: The model to use via HUD inference gateway (default: "gpt-4o").
|
|
52
|
+
Supports any model available through inference.hud.ai.
|
|
53
|
+
system_prompt: Optional custom system prompt for determining responses.
|
|
29
54
|
"""
|
|
30
|
-
|
|
31
|
-
if not
|
|
55
|
+
api_key = settings.api_key
|
|
56
|
+
if not api_key:
|
|
32
57
|
raise ValueError(
|
|
33
|
-
"
|
|
58
|
+
"HUD API key is required for auto_respond. Set HUD_API_KEY environment variable."
|
|
34
59
|
)
|
|
35
60
|
|
|
36
|
-
self.client = AsyncOpenAI(
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
self.system_prompt = (
|
|
40
|
-
system_prompt
|
|
41
|
-
or """
|
|
42
|
-
You are an assistant that helps determine the appropriate response to an agent's message.
|
|
43
|
-
|
|
44
|
-
You will receive messages from an agent that is performing tasks for a user.
|
|
45
|
-
Your job is to analyze these messages and respond with one of the following:
|
|
46
|
-
|
|
47
|
-
- STOP: If the agent indicates it has successfully completed a task or is stuck,
|
|
48
|
-
struggling or says it cannot complete the task, even if phrased as a question
|
|
49
|
-
like "I have entered the right values into this form. Would you like me to do
|
|
50
|
-
anything else?" or "Here is the website. Is there any other information you
|
|
51
|
-
need?" or if the agent has strongly determined it wants to stop the task like
|
|
52
|
-
"The task is infeasible. Can I help you with something else?"
|
|
53
|
-
|
|
54
|
-
- CONTINUE: If the agent is asking for clarification before proceeding with a task
|
|
55
|
-
like "I'm about to clear cookies from this website. Would you like me to proceed?"
|
|
56
|
-
or "I've entered the right values into this form. Would you like me to continue
|
|
57
|
-
with the rest of the task?"
|
|
58
|
-
|
|
59
|
-
Respond ONLY with one of these two options.
|
|
60
|
-
"""
|
|
61
|
+
self.client: AsyncOpenAI = AsyncOpenAI(
|
|
62
|
+
base_url=settings.hud_gateway_url,
|
|
63
|
+
api_key=api_key,
|
|
61
64
|
)
|
|
65
|
+
self.model = model
|
|
66
|
+
self.system_prompt = system_prompt or DEFAULT_SYSTEM_PROMPT
|
|
62
67
|
|
|
68
|
+
@instrument(
|
|
69
|
+
category="agent",
|
|
70
|
+
name="response_agent",
|
|
71
|
+
internal_type="user-message",
|
|
72
|
+
)
|
|
63
73
|
async def determine_response(self, agent_message: str) -> ResponseType:
|
|
64
74
|
"""
|
|
65
75
|
Determine whether the agent should stop or continue based on its message.
|
|
@@ -80,8 +90,9 @@ class ResponseAgent:
|
|
|
80
90
|
"content": f"Agent message: {agent_message}\n\nWhat is the appropriate response?", # noqa: E501
|
|
81
91
|
},
|
|
82
92
|
],
|
|
83
|
-
temperature=0.1,
|
|
84
|
-
max_tokens=5,
|
|
93
|
+
temperature=0.1,
|
|
94
|
+
max_tokens=5,
|
|
95
|
+
extra_headers={"Trace-Id": ""},
|
|
85
96
|
)
|
|
86
97
|
|
|
87
98
|
response_text = response.choices[0].message.content
|
|
@@ -96,5 +107,6 @@ class ResponseAgent:
|
|
|
96
107
|
else:
|
|
97
108
|
return "CONTINUE"
|
|
98
109
|
|
|
99
|
-
except Exception:
|
|
110
|
+
except Exception as e:
|
|
111
|
+
logger.warning("Auto-respond failed: %s", e)
|
|
100
112
|
return "CONTINUE" # Default to continue on error
|