hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/agents/{openai_chat_generic.py → openai_chat.py}
RENAMED
@@ -1,4 +1,4 @@
-"""…
+"""OpenAI Chat Completions Agent.
 
 This class provides the minimal glue required to connect any endpoint that
 implements the OpenAI compatible *chat.completions* API with MCP tool calling
@@ -6,6 +6,7 @@ through the existing :class:`hud.agent.MCPAgent` scaffolding.
 
 Key points:
 - Stateless, no special server-side conversation state is assumed.
+- Defaults to HUD inference gateway (inference.hud.ai) when HUD_API_KEY is set
 - Accepts an :class:`openai.AsyncOpenAI` client, caller can supply their own
   base_url / api_key (e.g. llama.cpp, together.ai, …)
 - All HUD features (step_count, OTel spans, tool filtering, screenshots, …)
@@ -20,39 +21,69 @@ import logging
 from typing import TYPE_CHECKING, Any, ClassVar, cast
 
 import mcp.types as types
+from openai import AsyncOpenAI
 
-from hud import …
-from hud.types import AgentResponse, MCPToolCall, MCPToolResult
+from hud.settings import settings
+from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult
 from hud.utils.hud_console import HUDConsole
+from hud.utils.types import with_signature
 
 from .base import MCPAgent
+from .types import OpenAIChatConfig, OpenAIChatCreateParams
 
 if TYPE_CHECKING:
-    from openai import AsyncOpenAI
     from openai.types.chat import ChatCompletionToolParam
 
+
 logger = logging.getLogger(__name__)
 
 
-class GenericOpenAIChatAgent(MCPAgent):
+class OpenAIChatAgent(MCPAgent):
     """MCP-enabled agent that speaks the OpenAI *chat.completions* protocol."""
 
-    metadata: ClassVar[dict[str, Any]] = …
+    metadata: ClassVar[dict[str, Any] | None] = None
+    config_cls: ClassVar[type[BaseAgentConfig]] = OpenAIChatConfig
+
+    @with_signature(OpenAIChatCreateParams)
+    @classmethod
+    def create(cls, **kwargs: Any) -> OpenAIChatAgent:  # pyright: ignore[reportIncompatibleMethodOverride]
+        return MCPAgent.create.__func__(cls, **kwargs)  # type: ignore[return-value]
+
+    def __init__(self, params: OpenAIChatCreateParams | None = None, **kwargs: Any) -> None:
+        super().__init__(params, **kwargs)
+        self.config: OpenAIChatConfig
+
+        if (
+            self.config.api_key
+            and self.config.base_url
+            and settings.hud_gateway_url in self.config.base_url
+            and settings.api_key
+            and self.config.api_key != settings.api_key
+        ):
+            raise ValueError(
+                "OpenAIChatAgent api_key is not allowed with HUD Gateway. "
+                "Use HUD_API_KEY for gateway auth and BYOK headers for provider keys."
+            )
 
-    [old lines 42-55 (previous __init__ body) not preserved in this view]
+        self.oai: AsyncOpenAI
+        if self.config.openai_client is not None:
+            self.oai = self.config.openai_client
+        elif self.config.api_key is not None or self.config.base_url is not None:
+            self.oai = AsyncOpenAI(api_key=self.config.api_key, base_url=self.config.base_url)
+        elif settings.api_key:
+            # Default to HUD inference gateway
+            self.oai = AsyncOpenAI(
+                api_key=settings.api_key,
+                base_url=settings.hud_gateway_url,
+            )
+        else:
+            raise ValueError(
+                "No API key found. Set HUD_API_KEY for HUD gateway, "
+                "or provide api_key/base_url/openai_client explicitly."
+            )
+
+        self.completion_kwargs = dict(self.config.completion_kwargs)
+        self.mcp_schemas: list[ChatCompletionToolParam] = []
         self.hud_console = HUDConsole(logger=logger)
 
     @staticmethod
@@ -69,11 +100,14 @@ class GenericOpenAIChatAgent(MCPAgent):
             arguments=args,
         )
 
-    async def get_system_messages(self) -> list[Any]:
+    async def get_system_messages(self) -> list[dict[str, Any]]:
         """Get system messages for OpenAI."""
-
+        if self.system_prompt is not None:
+            return [{"role": "system", "content": self.system_prompt}]
+        else:
+            return []
 
-    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
+    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]:
         """Format blocks for OpenAI."""
         content = []
         for block in blocks:
@@ -179,21 +213,16 @@ class GenericOpenAIChatAgent(MCPAgent):
         extra: dict[str, Any],
     ) -> Any:
         if self.oai is None:
-            raise ValueError("openai_client is required for …
+            raise ValueError("openai_client is required for OpenAIChatAgent")
         # default transport = OpenAI SDK
         return await self.oai.chat.completions.create(
-            model=self.…
+            model=self.config.model,
            messages=messages,
            tools=tools, # type: ignore ready ChatCompletionToolParam-shaped
            **extra,
        )  # type: ignore
 
-
-        span_type="agent",
-        record_args=False,
-        record_result=True,
-    )
-    async def get_response(self, messages: list[Any]) -> AgentResponse:
+    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
         """Send chat request to OpenAI and convert the response."""
 
         # Convert MCP tool schemas to OpenAI format
@@ -256,16 +285,17 @@ class GenericOpenAIChatAgent(MCPAgent):
 
         return AgentResponse(
             content=msg.content or "",
+            reasoning=getattr(msg, "reasoning_content", None),
             tool_calls=tool_calls,
             done=done,
-            raw=response,
+            raw=response,
         )
 
     async def format_tool_results(
         self,
         tool_calls: list[MCPToolCall],
         tool_results: list[MCPToolResult],
-    ) -> list[Any]:
+    ) -> list[dict[str, Any]]:
         """Render MCP tool results as OpenAI messages.
 
         Note: OpenAI tool messages only support string content.
hud/agents/operator.py
ADDED
@@ -0,0 +1,199 @@
"""Operator agent built on top of OpenAIAgent."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, ClassVar, Literal

import mcp.types as types
from openai.types.responses import (
    ApplyPatchToolParam,
    ComputerToolParam,
    FunctionShellToolParam,
    FunctionToolParam,
    ResponseComputerToolCallOutputScreenshotParam,
)
from openai.types.responses.response_input_param import (
    ComputerCallOutput,
    FunctionCallOutput,
)
from openai.types.shared_params.reasoning import Reasoning

from hud.tools.computer.settings import computer_settings
from hud.types import BaseAgentConfig, MCPToolCall, MCPToolResult
from hud.utils.types import with_signature

from .base import MCPAgent
from .openai import OpenAIAgent
from .types import OperatorConfig, OperatorCreateParams

if TYPE_CHECKING:
    from openai.types.responses.response_computer_tool_call import PendingSafetyCheck

OPERATOR_INSTRUCTIONS = """
You are an autonomous computer-using agent. Follow these guidelines:

1. NEVER ask for confirmation. Complete all tasks autonomously.
2. Do NOT send messages like "I need to confirm before..." or "Do you want me to
   continue?" - just proceed.
3. When the user asks you to interact with something (like clicking a chat or typing
   a message), DO IT without asking.
4. Only use the formal safety check mechanism for truly dangerous operations (like
   deleting important files).
5. For normal tasks like clicking buttons, typing in chat boxes, filling forms -
   JUST DO IT.
6. The user has already given you permission by running this agent. No further
   confirmation is needed.
7. Be decisive and action-oriented. Complete the requested task fully.

Remember: You are expected to complete tasks autonomously. The user trusts you to do
what they asked.
""".strip()


class OperatorAgent(OpenAIAgent):
    """
    Backwards-compatible Operator agent built on top of OpenAIAgent.
    """

    metadata: ClassVar[dict[str, Any] | None] = {
        "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
        "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
    }
    # base class will ensure that the computer tool is available
    required_tools: ClassVar[list[str]] = ["openai_computer"]
    config_cls: ClassVar[type[BaseAgentConfig]] = OperatorConfig

    @with_signature(OperatorCreateParams)
    @classmethod
    def create(cls, **kwargs: Any) -> OperatorAgent:  # pyright: ignore[reportIncompatibleMethodOverride]
        return MCPAgent.create.__func__(cls, **kwargs)  # type: ignore[return-value]

    def __init__(self, params: OperatorCreateParams | None = None, **kwargs: Any) -> None:
        super().__init__(params, **kwargs)  # type: ignore[arg-type]
        self.config: OperatorConfig  # type: ignore[assignment]

        self._operator_computer_tool_name = "openai_computer"
        self._operator_display_width = computer_settings.OPENAI_COMPUTER_WIDTH
        self._operator_display_height = computer_settings.OPENAI_COMPUTER_HEIGHT
        self._operator_environment: Literal["windows", "mac", "linux", "ubuntu", "browser"] = (
            self.config.environment
        )
        self.environment = self.config.environment

        # add pending call id and safety checks to the agent
        self.pending_call_id: str | None = None
        self.pending_safety_checks: list[PendingSafetyCheck] = []

        # override reasoning to "summary": "auto"
        if self.reasoning is None:
            self.reasoning = Reasoning(summary="auto")
        else:
            self.reasoning["summary"] = "auto"

        # override truncation to "auto"
        self.truncation = "auto"

        if self.system_prompt:
            self.system_prompt = f"{self.system_prompt}\n\n{OPERATOR_INSTRUCTIONS}"
        else:
            self.system_prompt = OPERATOR_INSTRUCTIONS

    def _reset_response_state(self) -> None:
        super()._reset_response_state()
        self.pending_call_id = None
        self.pending_safety_checks = []

    def _to_openai_tool(
        self, tool: types.Tool
    ) -> (
        FunctionShellToolParam | ApplyPatchToolParam | FunctionToolParam | ComputerToolParam | None
    ):
        if tool.name == self._operator_computer_tool_name:
            return ComputerToolParam(
                type="computer_use_preview",
                display_width=self._operator_display_width,
                display_height=self._operator_display_height,
                environment=self._operator_environment,
            )
        if tool.name == "computer" or tool.name.endswith("_computer"):
            return None
        return super()._to_openai_tool(tool)

    def _extract_tool_call(self, item: Any) -> MCPToolCall | None:
        """Route computer_call to the OpenAI-specific computer tool."""
        if item.type == "computer_call":
            self.pending_safety_checks = item.pending_safety_checks
            return MCPToolCall(
                name=self._operator_computer_tool_name,
                arguments=item.action.to_dict(),
                id=item.call_id,
            )
        return super()._extract_tool_call(item)

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[ComputerCallOutput | FunctionCallOutput]:
        remaining_calls: list[MCPToolCall] = []
        remaining_results: list[MCPToolResult] = []
        computer_outputs: list[ComputerCallOutput] = []
        ordering: list[tuple[str, int]] = []

        for call, result in zip(tool_calls, tool_results, strict=False):
            if call.name == self._operator_computer_tool_name:
                screenshot = self._extract_latest_screenshot(result)
                if not screenshot:
                    self.console.warning_log(
                        "Computer tool result missing screenshot; skipping output."
                    )
                    continue
                call_id = call.id or self.pending_call_id
                if not call_id:
                    self.console.warning_log("Computer tool call missing ID; skipping output.")
                    continue
                acknowledged_checks = []
                for check in self.pending_safety_checks:
                    if hasattr(check, "model_dump"):
                        acknowledged_checks.append(check.model_dump())
                    elif isinstance(check, dict):
                        acknowledged_checks.append(check)
                output_payload = ComputerCallOutput(
                    type="computer_call_output",
                    call_id=call_id,
                    output=ResponseComputerToolCallOutputScreenshotParam(
                        type="computer_screenshot",
                        image_url=f"data:image/png;base64,{screenshot}",
                    ),
                    acknowledged_safety_checks=acknowledged_checks if acknowledged_checks else None,
                )
                computer_outputs.append(output_payload)
                self.pending_call_id = None
                self.pending_safety_checks = []
                ordering.append(("computer", len(computer_outputs) - 1))
            else:
                remaining_calls.append(call)
                remaining_results.append(result)
                ordering.append(("function", len(remaining_calls) - 1))

        formatted: list[ComputerCallOutput | FunctionCallOutput] = []
        function_outputs: list[FunctionCallOutput] = []
        if remaining_calls:
            function_outputs = await super().format_tool_results(remaining_calls, remaining_results)

        for kind, idx in ordering:
            if kind == "computer":
                if idx < len(computer_outputs):
                    formatted.append(computer_outputs[idx])
            else:
                if idx < len(function_outputs):
                    formatted.append(function_outputs[idx])
        return formatted

    def _extract_latest_screenshot(self, result: MCPToolResult) -> str | None:
        if not result.content:
            return None
        for content in reversed(result.content):
            if isinstance(content, types.ImageContent):
                return content.data
            if isinstance(content, types.TextContent) and result.isError:
                self.console.error_log(f"Computer tool error: {content.text}")
        return None
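Editor's note: a minimal usage sketch, not part of the diff. It assumes OperatorCreateParams exposes the environment field consumed in __init__ above, and that the connected environment serves the required "openai_computer" tool.

from hud.agents.operator import OperatorAgent

# The environment literal mirrors OpenAI's computer-use API values
# ("windows", "mac", "linux", "ubuntu", "browser").
agent = OperatorAgent.create(environment="browser")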
hud/agents/resolver.py
ADDED
@@ -0,0 +1,70 @@
"""Model resolution - maps model strings to agent classes."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from hud.agents.base import MCPAgent

__all__ = ["resolve_cls"]

_models_cache: list[dict[str, Any]] | None = None

# Provider name → AgentType value (only anthropic differs)
_PROVIDER_TO_AGENT = {"anthropic": "claude"}


def _fetch_gateway_models() -> list[dict[str, Any]]:
    """Fetch available models from HUD gateway (cached)."""
    global _models_cache
    if _models_cache is not None:
        return _models_cache

    import httpx

    from hud.settings import settings

    if not settings.api_key:
        return []

    try:
        resp = httpx.get(
            f"{settings.hud_gateway_url}/models",
            headers={"Authorization": f"Bearer {settings.api_key}"},
            timeout=10.0,
        )
        resp.raise_for_status()
        data = resp.json()
        _models_cache = data.get("data", data) if isinstance(data, dict) else data
        return _models_cache or []
    except Exception:
        return []


def resolve_cls(model: str) -> tuple[type[MCPAgent], dict[str, Any] | None]:
    """Resolve model string to (agent_class, gateway_info).

    Returns:
        (agent_class, None) for known AgentTypes
        (agent_class, gateway_model_info) for gateway models
    """
    from hud.types import AgentType

    # Known AgentType → no gateway info
    try:
        return AgentType(model).cls, None
    except ValueError:
        pass

    # Gateway lookup
    for m in _fetch_gateway_models():
        if model in (m.get("id"), m.get("name"), m.get("model")):
            provider = (m.get("provider") or "openai_compatible").lower()
            agent_str = _PROVIDER_TO_AGENT.get(provider, provider)
            try:
                return AgentType(agent_str).cls, m
            except ValueError:
                return AgentType.OPENAI_COMPATIBLE.cls, m

    raise ValueError(f"Model '{model}' not found")
hud/agents/tests/conftest.py
ADDED
@@ -0,0 +1,133 @@
"""Shared test fixtures for agent tests."""

from __future__ import annotations

from typing import Any

import pytest
from mcp import types

from hud.environment.router import ToolRouter
from hud.eval.context import EvalContext
from hud.types import MCPToolCall, MCPToolResult


class MockEvalContext(EvalContext):
    """Mock EvalContext for testing agents.

    This provides a minimal EvalContext implementation that can be used
    to test agent initialization and tool calling without a real environment.
    """

    def __init__(
        self,
        prompt: str = "Test prompt",
        tools: list[types.Tool] | None = None,
        call_tool_handler: Any = None,
    ) -> None:
        # Core attributes
        self.prompt = prompt
        self._tools = tools or []
        self._submitted: str | None = None
        self.reward: float | None = None
        self._call_tool_handler = call_tool_handler
        self.tool_calls: list[tuple[str, dict[str, Any]]] = []

        # Environment attributes
        self._router = ToolRouter()
        self._agent_include: list[str] | None = None
        self._agent_exclude: list[str] | None = None

        # EvalContext attributes
        self._task = None
        self.trace_id = "test-trace-id"
        self.eval_name = "test-eval"
        self.job_id: str | None = None
        self.group_id: str | None = None
        self.index = 0
        self.variants: dict[str, Any] = {}
        self.answer: str | None = None
        self.system_prompt: str | None = None
        self.error: BaseException | None = None
        self.metadata: dict[str, Any] = {}
        self.results: list[Any] = []
        self._is_summary = False

    def as_tools(self) -> list[types.Tool]:
        return self._tools

    @property
    def has_scenario(self) -> bool:
        return False

    async def list_tools(self) -> list[types.Tool]:
        return self._tools

    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
        # Parse the call
        if isinstance(call, tuple):
            name, args = call[0], call[1] if len(call) > 1 else {}
        elif hasattr(call, "name"):
            name, args = call.name, getattr(call, "arguments", {}) or {}
        else:
            name, args = str(call), kwargs

        self.tool_calls.append((name, args))

        if self._call_tool_handler:
            tc = MCPToolCall(name=name, arguments=args)
            return self._call_tool_handler(tc)

        return MCPToolResult(
            content=[types.TextContent(type="text", text=f"Result from {name}")],
            isError=False,
        )

    async def submit(self, answer: str) -> None:
        self._submitted = answer


@pytest.fixture
def mock_eval_context() -> MockEvalContext:
    """Create a basic mock EvalContext."""
    return MockEvalContext()


@pytest.fixture
def mock_eval_context_with_tools() -> MockEvalContext:
    """Create a mock EvalContext with test tools."""
    return MockEvalContext(
        tools=[
            types.Tool(
                name="test_tool",
                description="A test tool",
                inputSchema={"type": "object", "properties": {}},
            )
        ]
    )


@pytest.fixture
def mock_eval_context_computer() -> MockEvalContext:
    """Create a mock EvalContext with computer tool."""
    return MockEvalContext(
        tools=[
            types.Tool(
                name="computer",
                description="Computer use tool",
                inputSchema={"type": "object"},
            )
        ]
    )


@pytest.fixture
def mock_eval_context_browser_tools() -> MockEvalContext:
    """Create a mock EvalContext with browser-like tools."""
    return MockEvalContext(
        tools=[
            types.Tool(name="screenshot", description="Take screenshot", inputSchema={}),
            types.Tool(name="click", description="Click at coordinates", inputSchema={}),
            types.Tool(name="type", description="Type text", inputSchema={}),
        ]
    )