hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/agents/{openai_chat_generic.py → openai_chat.py} RENAMED

@@ -1,4 +1,4 @@
-"""
+"""OpenAI Chat Completions Agent.
 
 This class provides the minimal glue required to connect any endpoint that
 implements the OpenAI compatible *chat.completions* API with MCP tool calling
@@ -6,6 +6,7 @@ through the existing :class:`hud.agent.MCPAgent` scaffolding.
 
 Key points:
 - Stateless, no special server-side conversation state is assumed.
+- Defaults to HUD inference gateway (inference.hud.ai) when HUD_API_KEY is set
 - Accepts an :class:`openai.AsyncOpenAI` client, caller can supply their own
   base_url / api_key (e.g. llama.cpp, together.ai, …)
 - All HUD features (step_count, OTel spans, tool filtering, screenshots, …)
@@ -20,39 +21,85 @@ import logging
 from typing import TYPE_CHECKING, Any, ClassVar, cast
 
 import mcp.types as types
+from openai import AsyncOpenAI
+from pydantic import ConfigDict, Field
 
-from hud import
-from hud.types import AgentResponse, MCPToolCall, MCPToolResult
+from hud.settings import settings
+from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult
 from hud.utils.hud_console import HUDConsole
+from hud.utils.types import with_signature
 
-from .base import MCPAgent
+from .base import BaseCreateParams, MCPAgent
 
 if TYPE_CHECKING:
-    from openai import AsyncOpenAI
     from openai.types.chat import ChatCompletionToolParam
 
+
 logger = logging.getLogger(__name__)
 
 
-class
+class OpenAIChatConfig(BaseAgentConfig):
+    """Configuration for `OpenAIChatAgent`."""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    model_name: str = "OpenAI Chat"
+    model: str = "gpt-5-mini"
+    openai_client: AsyncOpenAI | None = None
+    api_key: str | None = None
+    base_url: str | None = None
+    completion_kwargs: dict[str, Any] = Field(default_factory=dict)
+
+
+class OpenAIChatCreateParams(BaseCreateParams, OpenAIChatConfig):
+    pass
+
+
+class OpenAIChatAgent(MCPAgent):
     """MCP-enabled agent that speaks the OpenAI *chat.completions* protocol."""
 
-    metadata: ClassVar[dict[str, Any]] =
+    metadata: ClassVar[dict[str, Any] | None] = None
+    config_cls: ClassVar[type[BaseAgentConfig]] = OpenAIChatConfig
+
+    @with_signature(OpenAIChatCreateParams)
+    @classmethod
+    def create(cls, **kwargs: Any) -> OpenAIChatAgent:  # pyright: ignore[reportIncompatibleMethodOverride]
+        return MCPAgent.create.__func__(cls, **kwargs)  # type: ignore[return-value]
+
+    def __init__(self, params: OpenAIChatCreateParams | None = None, **kwargs: Any) -> None:
+        super().__init__(params, **kwargs)
+        self.config: OpenAIChatConfig
+
+        if (
+            self.config.api_key
+            and self.config.base_url
+            and settings.hud_gateway_url in self.config.base_url
+            and settings.api_key
+            and self.config.api_key != settings.api_key
+        ):
+            raise ValueError(
+                "OpenAIChatAgent api_key is not allowed with HUD Gateway. "
+                "Use HUD_API_KEY for gateway auth and BYOK headers for provider keys."
+            )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if self.config.openai_client is not None:
+            self.oai = self.config.openai_client
+        elif self.config.api_key is not None or self.config.base_url is not None:
+            self.oai = AsyncOpenAI(api_key=self.config.api_key, base_url=self.config.base_url)
+        elif settings.api_key:
+            # Default to HUD inference gateway
+            self.oai = AsyncOpenAI(
+                api_key=settings.api_key,
+                base_url=settings.hud_gateway_url,
+            )
+        else:
+            raise ValueError(
+                "No API key found. Set HUD_API_KEY for HUD gateway, "
+                "or provide api_key/base_url/openai_client explicitly."
+            )
+
+        self.completion_kwargs = dict(self.config.completion_kwargs)
+        self.mcp_schemas: list[ChatCompletionToolParam] = []
         self.hud_console = HUDConsole(logger=logger)
 
     @staticmethod
@@ -69,11 +116,14 @@ class GenericOpenAIChatAgent(MCPAgent):
             arguments=args,
         )
 
-    async def get_system_messages(self) -> list[Any]:
+    async def get_system_messages(self) -> list[dict[str, Any]]:
         """Get system messages for OpenAI."""
-
+        if self.system_prompt is not None:
+            return [{"role": "system", "content": self.system_prompt}]
+        else:
+            return []
 
-    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
+    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]:
         """Format blocks for OpenAI."""
         content = []
         for block in blocks:
@@ -179,21 +229,16 @@ class GenericOpenAIChatAgent(MCPAgent):
         extra: dict[str, Any],
    ) -> Any:
        if self.oai is None:
-            raise ValueError("openai_client is required for
+            raise ValueError("openai_client is required for OpenAIChatAgent")
        # default transport = OpenAI SDK
        return await self.oai.chat.completions.create(
-            model=self.
+            model=self.config.model,
            messages=messages,
            tools=tools,  # type: ignore ready ChatCompletionToolParam-shaped
            **extra,
        )  # type: ignore
 
-
-        span_type="agent",
-        record_args=False,
-        record_result=True,
-    )
-    async def get_response(self, messages: list[Any]) -> AgentResponse:
+    async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
        """Send chat request to OpenAI and convert the response."""
 
        # Convert MCP tool schemas to OpenAI format
@@ -256,16 +301,17 @@ class GenericOpenAIChatAgent(MCPAgent):
 
        return AgentResponse(
            content=msg.content or "",
+            reasoning=getattr(msg, "reasoning_content", None),
            tool_calls=tool_calls,
            done=done,
-            raw=response,
+            raw=response,
        )
 
    async def format_tool_results(
        self,
        tool_calls: list[MCPToolCall],
        tool_results: list[MCPToolResult],
-    ) -> list[Any]:
+    ) -> list[dict[str, Any]]:
        """Render MCP tool results as OpenAI messages.
 
        Note: OpenAI tool messages only support string content.
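The rename from GenericOpenAIChatAgent to OpenAIChatAgent also moves construction onto the shared config / create() path. A minimal construction sketch, assuming the module path shown above; the model, api_key, and base_url values are placeholders, and driving the agent through MCPAgent afterwards is not part of this diff:

    from openai import AsyncOpenAI

    from hud.agents.openai_chat import OpenAIChatAgent

    # Gateway default: with HUD_API_KEY set, create() falls through to the
    # HUD inference gateway branch shown in __init__ above.
    agent = OpenAIChatAgent.create(model="gpt-5-mini")

    # Bring-your-own endpoint: any OpenAI-compatible chat.completions server.
    # base_url, api_key, and model below are placeholders, not values from this release.
    client = AsyncOpenAI(api_key="sk-placeholder", base_url="http://localhost:8000/v1")
    agent = OpenAIChatAgent.create(openai_client=client, model="local-model")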
hud/agents/operator.py ADDED

@@ -0,0 +1,211 @@
+"""Operator agent built on top of OpenAIAgent."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, ClassVar, Literal
+
+import mcp.types as types
+from openai.types.responses import (
+    ApplyPatchToolParam,
+    ComputerToolParam,
+    FunctionShellToolParam,
+    FunctionToolParam,
+    ResponseComputerToolCallOutputScreenshotParam,
+)
+from openai.types.responses.response_input_param import (
+    ComputerCallOutput,
+    FunctionCallOutput,
+)
+from openai.types.shared_params.reasoning import Reasoning
+from pydantic import ConfigDict
+
+from hud.tools.computer.settings import computer_settings
+from hud.types import BaseAgentConfig, MCPToolCall, MCPToolResult
+from hud.utils.types import with_signature
+
+from .base import BaseCreateParams, MCPAgent
+from .openai import OpenAIAgent, OpenAIConfig
+
+if TYPE_CHECKING:
+    from openai.types.responses.response_computer_tool_call import PendingSafetyCheck
+
+OPERATOR_INSTRUCTIONS = """
+You are an autonomous computer-using agent. Follow these guidelines:
+
+1. NEVER ask for confirmation. Complete all tasks autonomously.
+2. Do NOT send messages like "I need to confirm before..." or "Do you want me to
+continue?" - just proceed.
+3. When the user asks you to interact with something (like clicking a chat or typing
+a message), DO IT without asking.
+4. Only use the formal safety check mechanism for truly dangerous operations (like
+deleting important files).
+5. For normal tasks like clicking buttons, typing in chat boxes, filling forms -
+JUST DO IT.
+6. The user has already given you permission by running this agent. No further
+confirmation is needed.
+7. Be decisive and action-oriented. Complete the requested task fully.
+
+Remember: You are expected to complete tasks autonomously. The user trusts you to do
+what they asked.
+""".strip()
+
+
+class OperatorConfig(OpenAIConfig):
+    """Configuration model for `OperatorAgent`."""
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    model_name: str = "Operator"
+    model: str = "computer-use-preview"
+    environment: Literal["windows", "mac", "linux", "ubuntu", "browser"] = "linux"
+
+
+class OperatorCreateParams(BaseCreateParams, OperatorConfig):
+    pass
+
+
+class OperatorAgent(OpenAIAgent):
+    """
+    Backwards-compatible Operator agent built on top of OpenAIAgent.
+    """
+
+    metadata: ClassVar[dict[str, Any] | None] = {
+        "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
+        "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
+    }
+    # base class will ensure that the computer tool is available
+    required_tools: ClassVar[list[str]] = ["openai_computer"]
+    config_cls: ClassVar[type[BaseAgentConfig]] = OperatorConfig
+
+    @with_signature(OperatorCreateParams)
+    @classmethod
+    def create(cls, **kwargs: Any) -> OperatorAgent:  # pyright: ignore[reportIncompatibleMethodOverride]
+        return MCPAgent.create.__func__(cls, **kwargs)  # type: ignore[return-value]
+
+    def __init__(self, params: OperatorCreateParams | None = None, **kwargs: Any) -> None:
+        super().__init__(params, **kwargs)  # type: ignore[arg-type]
+        self.config: OperatorConfig  # type: ignore[assignment]
+
+        self._operator_computer_tool_name = "openai_computer"
+        self._operator_display_width = computer_settings.OPENAI_COMPUTER_WIDTH
+        self._operator_display_height = computer_settings.OPENAI_COMPUTER_HEIGHT
+        self._operator_environment: Literal["windows", "mac", "linux", "ubuntu", "browser"] = (
+            self.config.environment
+        )
+        self.environment = self.config.environment
+
+        # add pending call id and safety checks to the agent
+        self.pending_call_id: str | None = None
+        self.pending_safety_checks: list[PendingSafetyCheck] = []
+
+        # override reasoning to "summary": "auto"
+        if self.reasoning is None:
+            self.reasoning = Reasoning(summary="auto")
+        else:
+            self.reasoning["summary"] = "auto"
+
+        # override truncation to "auto"
+        self.truncation = "auto"
+
+        if self.system_prompt:
+            self.system_prompt = f"{self.system_prompt}\n\n{OPERATOR_INSTRUCTIONS}"
+        else:
+            self.system_prompt = OPERATOR_INSTRUCTIONS
+
+    def _reset_response_state(self) -> None:
+        super()._reset_response_state()
+        self.pending_call_id = None
+        self.pending_safety_checks = []
+
+    def _to_openai_tool(
+        self, tool: types.Tool
+    ) -> (
+        FunctionShellToolParam | ApplyPatchToolParam | FunctionToolParam | ComputerToolParam | None
+    ):
+        if tool.name == self._operator_computer_tool_name:
+            return ComputerToolParam(
+                type="computer_use_preview",
+                display_width=self._operator_display_width,
+                display_height=self._operator_display_height,
+                environment=self._operator_environment,
+            )
+        return super()._to_openai_tool(tool)
+
+    def _extract_tool_call(self, item: Any) -> MCPToolCall | None:
+        """Route computer_call to the OpenAI-specific computer tool."""
+        if item.type == "computer_call":
+            self.pending_safety_checks = item.pending_safety_checks
+            return MCPToolCall(
+                name=self._operator_computer_tool_name,
+                arguments=item.action.to_dict(),
+                id=item.call_id,
+            )
+        return super()._extract_tool_call(item)
+
+    async def format_tool_results(
+        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
+    ) -> list[ComputerCallOutput | FunctionCallOutput]:
+        remaining_calls: list[MCPToolCall] = []
+        remaining_results: list[MCPToolResult] = []
+        computer_outputs: list[ComputerCallOutput] = []
+        ordering: list[tuple[str, int]] = []
+
+        for call, result in zip(tool_calls, tool_results, strict=False):
+            if call.name == self._operator_computer_tool_name:
+                screenshot = self._extract_latest_screenshot(result)
+                if not screenshot:
+                    self.console.warning_log(
+                        "Computer tool result missing screenshot; skipping output."
+                    )
+                    continue
+                call_id = call.id or self.pending_call_id
+                if not call_id:
+                    self.console.warning_log("Computer tool call missing ID; skipping output.")
+                    continue
+                acknowledged_checks = []
+                for check in self.pending_safety_checks:
+                    if hasattr(check, "model_dump"):
+                        acknowledged_checks.append(check.model_dump())
+                    elif isinstance(check, dict):
+                        acknowledged_checks.append(check)
+                output_payload = ComputerCallOutput(
+                    type="computer_call_output",
+                    call_id=call_id,
+                    output=ResponseComputerToolCallOutputScreenshotParam(
+                        type="computer_screenshot",
+                        image_url=f"data:image/png;base64,{screenshot}",
+                    ),
+                    acknowledged_safety_checks=acknowledged_checks if acknowledged_checks else None,
+                )
+                computer_outputs.append(output_payload)
+                self.pending_call_id = None
+                self.pending_safety_checks = []
+                ordering.append(("computer", len(computer_outputs) - 1))
+            else:
+                remaining_calls.append(call)
+                remaining_results.append(result)
+                ordering.append(("function", len(remaining_calls) - 1))
+
+        formatted: list[ComputerCallOutput | FunctionCallOutput] = []
+        function_outputs: list[FunctionCallOutput] = []
+        if remaining_calls:
+            function_outputs = await super().format_tool_results(remaining_calls, remaining_results)
+
+        for kind, idx in ordering:
+            if kind == "computer":
+                if idx < len(computer_outputs):
+                    formatted.append(computer_outputs[idx])
+            else:
+                if idx < len(function_outputs):
+                    formatted.append(function_outputs[idx])
+        return formatted
+
+    def _extract_latest_screenshot(self, result: MCPToolResult) -> str | None:
+        if not result.content:
+            return None
+        for content in reversed(result.content):
+            if isinstance(content, types.ImageContent):
+                return content.data
+            if isinstance(content, types.TextContent) and result.isError:
+                self.console.error_log(f"Computer tool error: {content.text}")
+        return None
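OperatorAgent reintroduces the computer-use Operator behaviour (computer_use_preview tool, auto reasoning summaries, auto truncation, autonomous-operation instructions) on top of the new OpenAIAgent. A minimal construction sketch, assuming defaults for every field not listed; wiring the agent to an environment that exposes the required openai_computer tool is outside this diff:

    from hud.agents.operator import OperatorAgent

    # environment is forwarded to the computer_use_preview tool definition;
    # display width/height come from hud.tools.computer.settings.computer_settings.
    # All other config fields are left at their defaults here.
    agent = OperatorAgent.create(environment="browser")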
hud/agents/tests/conftest.py ADDED

@@ -0,0 +1,133 @@
+"""Shared test fixtures for agent tests."""
+
+from __future__ import annotations
+
+from typing import Any
+
+import pytest
+from mcp import types
+
+from hud.environment.router import ToolRouter
+from hud.eval.context import EvalContext
+from hud.types import MCPToolCall, MCPToolResult
+
+
+class MockEvalContext(EvalContext):
+    """Mock EvalContext for testing agents.
+
+    This provides a minimal EvalContext implementation that can be used
+    to test agent initialization and tool calling without a real environment.
+    """
+
+    def __init__(
+        self,
+        prompt: str = "Test prompt",
+        tools: list[types.Tool] | None = None,
+        call_tool_handler: Any = None,
+    ) -> None:
+        # Core attributes
+        self.prompt = prompt
+        self._tools = tools or []
+        self._submitted: str | None = None
+        self.reward: float | None = None
+        self._call_tool_handler = call_tool_handler
+        self.tool_calls: list[tuple[str, dict[str, Any]]] = []
+
+        # Environment attributes
+        self._router = ToolRouter()
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+
+        # EvalContext attributes
+        self._task = None
+        self.trace_id = "test-trace-id"
+        self.eval_name = "test-eval"
+        self.job_id: str | None = None
+        self.group_id: str | None = None
+        self.index = 0
+        self.variants: dict[str, Any] = {}
+        self.answer: str | None = None
+        self.system_prompt: str | None = None
+        self.error: BaseException | None = None
+        self.metadata: dict[str, Any] = {}
+        self.results: list[Any] = []
+        self._is_summary = False
+
+    def as_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    @property
+    def has_scenario(self) -> bool:
+        return False
+
+    async def list_tools(self) -> list[types.Tool]:
+        return self._tools
+
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
+        # Parse the call
+        if isinstance(call, tuple):
+            name, args = call[0], call[1] if len(call) > 1 else {}
+        elif hasattr(call, "name"):
+            name, args = call.name, getattr(call, "arguments", {}) or {}
+        else:
+            name, args = str(call), kwargs
+
+        self.tool_calls.append((name, args))
+
+        if self._call_tool_handler:
+            tc = MCPToolCall(name=name, arguments=args)
+            return self._call_tool_handler(tc)
+
+        return MCPToolResult(
+            content=[types.TextContent(type="text", text=f"Result from {name}")],
+            isError=False,
+        )
+
+    async def submit(self, answer: str) -> None:
+        self._submitted = answer
+
+
+@pytest.fixture
+def mock_eval_context() -> MockEvalContext:
+    """Create a basic mock EvalContext."""
+    return MockEvalContext()
+
+
+@pytest.fixture
+def mock_eval_context_with_tools() -> MockEvalContext:
+    """Create a mock EvalContext with test tools."""
+    return MockEvalContext(
+        tools=[
+            types.Tool(
+                name="test_tool",
+                description="A test tool",
+                inputSchema={"type": "object", "properties": {}},
+            )
+        ]
+    )
+
+
+@pytest.fixture
+def mock_eval_context_computer() -> MockEvalContext:
+    """Create a mock EvalContext with computer tool."""
+    return MockEvalContext(
+        tools=[
+            types.Tool(
+                name="computer",
+                description="Computer use tool",
+                inputSchema={"type": "object"},
+            )
+        ]
+    )
+
+
+@pytest.fixture
+def mock_eval_context_browser_tools() -> MockEvalContext:
+    """Create a mock EvalContext with browser-like tools."""
+    return MockEvalContext(
+        tools=[
+            types.Tool(name="screenshot", description="Take screenshot", inputSchema={}),
+            types.Tool(name="click", description="Click at coordinates", inputSchema={}),
+            types.Tool(name="type", description="Type text", inputSchema={}),
+        ]
+    )