hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/agents/{openai_chat_generic.py → openai_chat.py}
@@ -1,4 +1,4 @@
1
- """Generic OpenAI chat-completions agent.
1
+ """OpenAI Chat Completions Agent.
2
2
 
3
3
  This class provides the minimal glue required to connect any endpoint that
4
4
  implements the OpenAI compatible *chat.completions* API with MCP tool calling
@@ -6,6 +6,7 @@ through the existing :class:`hud.agent.MCPAgent` scaffolding.
6
6
 
7
7
  Key points:
8
8
  - Stateless, no special server-side conversation state is assumed.
9
+ - Defaults to HUD inference gateway (inference.hud.ai) when HUD_API_KEY is set
9
10
  - Accepts an :class:`openai.AsyncOpenAI` client, caller can supply their own
10
11
  base_url / api_key (e.g. llama.cpp, together.ai, …)
11
12
  - All HUD features (step_count, OTel spans, tool filtering, screenshots, …)
@@ -20,39 +21,85 @@ import logging
20
21
  from typing import TYPE_CHECKING, Any, ClassVar, cast
21
22
 
22
23
  import mcp.types as types
24
+ from openai import AsyncOpenAI
25
+ from pydantic import ConfigDict, Field
23
26
 
24
- from hud import instrument
25
- from hud.types import AgentResponse, MCPToolCall, MCPToolResult
27
+ from hud.settings import settings
28
+ from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult
26
29
  from hud.utils.hud_console import HUDConsole
30
+ from hud.utils.types import with_signature
27
31
 
28
- from .base import MCPAgent
32
+ from .base import BaseCreateParams, MCPAgent
29
33
 
30
34
  if TYPE_CHECKING:
31
- from openai import AsyncOpenAI
32
35
  from openai.types.chat import ChatCompletionToolParam
33
36
 
37
+
34
38
  logger = logging.getLogger(__name__)
35
39
 
36
40
 
37
- class GenericOpenAIChatAgent(MCPAgent):
41
+ class OpenAIChatConfig(BaseAgentConfig):
42
+ """Configuration for `OpenAIChatAgent`."""
43
+
44
+ model_config = ConfigDict(arbitrary_types_allowed=True)
45
+
46
+ model_name: str = "OpenAI Chat"
47
+ model: str = "gpt-5-mini"
48
+ openai_client: AsyncOpenAI | None = None
49
+ api_key: str | None = None
50
+ base_url: str | None = None
51
+ completion_kwargs: dict[str, Any] = Field(default_factory=dict)
52
+
53
+
54
+ class OpenAIChatCreateParams(BaseCreateParams, OpenAIChatConfig):
55
+ pass
56
+
57
+
58
+ class OpenAIChatAgent(MCPAgent):
38
59
  """MCP-enabled agent that speaks the OpenAI *chat.completions* protocol."""
39
60
 
40
- metadata: ClassVar[dict[str, Any]] = {}
61
+ metadata: ClassVar[dict[str, Any] | None] = None
62
+ config_cls: ClassVar[type[BaseAgentConfig]] = OpenAIChatConfig
63
+
64
+ @with_signature(OpenAIChatCreateParams)
65
+ @classmethod
66
+ def create(cls, **kwargs: Any) -> OpenAIChatAgent: # pyright: ignore[reportIncompatibleMethodOverride]
67
+ return MCPAgent.create.__func__(cls, **kwargs) # type: ignore[return-value]
68
+
69
+ def __init__(self, params: OpenAIChatCreateParams | None = None, **kwargs: Any) -> None:
70
+ super().__init__(params, **kwargs)
71
+ self.config: OpenAIChatConfig
72
+
73
+ if (
74
+ self.config.api_key
75
+ and self.config.base_url
76
+ and settings.hud_gateway_url in self.config.base_url
77
+ and settings.api_key
78
+ and self.config.api_key != settings.api_key
79
+ ):
80
+ raise ValueError(
81
+ "OpenAIChatAgent api_key is not allowed with HUD Gateway. "
82
+ "Use HUD_API_KEY for gateway auth and BYOK headers for provider keys."
83
+ )
41
84
 
42
- def __init__(
43
- self,
44
- *,
45
- openai_client: AsyncOpenAI | None,
46
- model_name: str = "gpt-4o-mini",
47
- completion_kwargs: dict[str, Any] | None = None,
48
- **agent_kwargs: Any,
49
- ) -> None:
50
- # Accept base-agent settings via **agent_kwargs (e.g., mcp_client, system_prompt, etc.)
51
- super().__init__(**agent_kwargs)
52
- self.oai = openai_client
53
- self.model_name = model_name
54
- self.completion_kwargs: dict[str, Any] = completion_kwargs or {}
55
- self.mcp_schemas = []
85
+ if self.config.openai_client is not None:
86
+ self.oai = self.config.openai_client
87
+ elif self.config.api_key is not None or self.config.base_url is not None:
88
+ self.oai = AsyncOpenAI(api_key=self.config.api_key, base_url=self.config.base_url)
89
+ elif settings.api_key:
90
+ # Default to HUD inference gateway
91
+ self.oai = AsyncOpenAI(
92
+ api_key=settings.api_key,
93
+ base_url=settings.hud_gateway_url,
94
+ )
95
+ else:
96
+ raise ValueError(
97
+ "No API key found. Set HUD_API_KEY for HUD gateway, "
98
+ "or provide api_key/base_url/openai_client explicitly."
99
+ )
100
+
101
+ self.completion_kwargs = dict(self.config.completion_kwargs)
102
+ self.mcp_schemas: list[ChatCompletionToolParam] = []
56
103
  self.hud_console = HUDConsole(logger=logger)
57
104
 
58
105
  @staticmethod
@@ -69,11 +116,14 @@ class GenericOpenAIChatAgent(MCPAgent):
69
116
  arguments=args,
70
117
  )
71
118
 
72
- async def get_system_messages(self) -> list[Any]:
119
+ async def get_system_messages(self) -> list[dict[str, Any]]:
73
120
  """Get system messages for OpenAI."""
74
- return [{"role": "system", "content": self.system_prompt}]
121
+ if self.system_prompt is not None:
122
+ return [{"role": "system", "content": self.system_prompt}]
123
+ else:
124
+ return []
75
125
 
76
- async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
126
+ async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]:
77
127
  """Format blocks for OpenAI."""
78
128
  content = []
79
129
  for block in blocks:
@@ -179,21 +229,16 @@ class GenericOpenAIChatAgent(MCPAgent):
179
229
  extra: dict[str, Any],
180
230
  ) -> Any:
181
231
  if self.oai is None:
182
- raise ValueError("openai_client is required for GenericOpenAIChatAgent")
232
+ raise ValueError("openai_client is required for OpenAIChatAgent")
183
233
  # default transport = OpenAI SDK
184
234
  return await self.oai.chat.completions.create(
185
- model=self.model_name,
235
+ model=self.config.model,
186
236
  messages=messages,
187
237
  tools=tools, # type: ignore ready ChatCompletionToolParam-shaped
188
238
  **extra,
189
239
  ) # type: ignore
190
240
 
191
- @instrument(
192
- span_type="agent",
193
- record_args=False,
194
- record_result=True,
195
- )
196
- async def get_response(self, messages: list[Any]) -> AgentResponse:
241
+ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
197
242
  """Send chat request to OpenAI and convert the response."""
198
243
 
199
244
  # Convert MCP tool schemas to OpenAI format
@@ -256,16 +301,17 @@ class GenericOpenAIChatAgent(MCPAgent):
256
301
 
257
302
  return AgentResponse(
258
303
  content=msg.content or "",
304
+ reasoning=getattr(msg, "reasoning_content", None),
259
305
  tool_calls=tool_calls,
260
306
  done=done,
261
- raw=response, # Include raw response for access to Choice objects
307
+ raw=response,
262
308
  )
263
309
 
264
310
  async def format_tool_results(
265
311
  self,
266
312
  tool_calls: list[MCPToolCall],
267
313
  tool_results: list[MCPToolResult],
268
- ) -> list[Any]:
314
+ ) -> list[dict[str, Any]]:
269
315
  """Render MCP tool results as OpenAI messages.
270
316
 
271
317
  Note: OpenAI tool messages only support string content.
hud/agents/operator.py ADDED
@@ -0,0 +1,211 @@
1
+ """Operator agent built on top of OpenAIAgent."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any, ClassVar, Literal
6
+
7
+ import mcp.types as types
8
+ from openai.types.responses import (
9
+ ApplyPatchToolParam,
10
+ ComputerToolParam,
11
+ FunctionShellToolParam,
12
+ FunctionToolParam,
13
+ ResponseComputerToolCallOutputScreenshotParam,
14
+ )
15
+ from openai.types.responses.response_input_param import (
16
+ ComputerCallOutput,
17
+ FunctionCallOutput,
18
+ )
19
+ from openai.types.shared_params.reasoning import Reasoning
20
+ from pydantic import ConfigDict
21
+
22
+ from hud.tools.computer.settings import computer_settings
23
+ from hud.types import BaseAgentConfig, MCPToolCall, MCPToolResult
24
+ from hud.utils.types import with_signature
25
+
26
+ from .base import BaseCreateParams, MCPAgent
27
+ from .openai import OpenAIAgent, OpenAIConfig
28
+
29
+ if TYPE_CHECKING:
30
+ from openai.types.responses.response_computer_tool_call import PendingSafetyCheck
31
+
32
+ OPERATOR_INSTRUCTIONS = """
33
+ You are an autonomous computer-using agent. Follow these guidelines:
34
+
35
+ 1. NEVER ask for confirmation. Complete all tasks autonomously.
36
+ 2. Do NOT send messages like "I need to confirm before..." or "Do you want me to
37
+ continue?" - just proceed.
38
+ 3. When the user asks you to interact with something (like clicking a chat or typing
39
+ a message), DO IT without asking.
40
+ 4. Only use the formal safety check mechanism for truly dangerous operations (like
41
+ deleting important files).
42
+ 5. For normal tasks like clicking buttons, typing in chat boxes, filling forms -
43
+ JUST DO IT.
44
+ 6. The user has already given you permission by running this agent. No further
45
+ confirmation is needed.
46
+ 7. Be decisive and action-oriented. Complete the requested task fully.
47
+
48
+ Remember: You are expected to complete tasks autonomously. The user trusts you to do
49
+ what they asked.
50
+ """.strip()
51
+
52
+
53
+ class OperatorConfig(OpenAIConfig):
54
+ """Configuration model for `OperatorAgent`."""
55
+
56
+ model_config = ConfigDict(arbitrary_types_allowed=True)
57
+
58
+ model_name: str = "Operator"
59
+ model: str = "computer-use-preview"
60
+ environment: Literal["windows", "mac", "linux", "ubuntu", "browser"] = "linux"
61
+
62
+
63
+ class OperatorCreateParams(BaseCreateParams, OperatorConfig):
64
+ pass
65
+
66
+
67
+ class OperatorAgent(OpenAIAgent):
68
+ """
69
+ Backwards-compatible Operator agent built on top of OpenAIAgent.
70
+ """
71
+
72
+ metadata: ClassVar[dict[str, Any] | None] = {
73
+ "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
74
+ "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
75
+ }
76
+ # base class will ensure that the computer tool is available
77
+ required_tools: ClassVar[list[str]] = ["openai_computer"]
78
+ config_cls: ClassVar[type[BaseAgentConfig]] = OperatorConfig
79
+
80
+ @with_signature(OperatorCreateParams)
81
+ @classmethod
82
+ def create(cls, **kwargs: Any) -> OperatorAgent: # pyright: ignore[reportIncompatibleMethodOverride]
83
+ return MCPAgent.create.__func__(cls, **kwargs) # type: ignore[return-value]
84
+
85
+ def __init__(self, params: OperatorCreateParams | None = None, **kwargs: Any) -> None:
86
+ super().__init__(params, **kwargs) # type: ignore[arg-type]
87
+ self.config: OperatorConfig # type: ignore[assignment]
88
+
89
+ self._operator_computer_tool_name = "openai_computer"
90
+ self._operator_display_width = computer_settings.OPENAI_COMPUTER_WIDTH
91
+ self._operator_display_height = computer_settings.OPENAI_COMPUTER_HEIGHT
92
+ self._operator_environment: Literal["windows", "mac", "linux", "ubuntu", "browser"] = (
93
+ self.config.environment
94
+ )
95
+ self.environment = self.config.environment
96
+
97
+ # add pending call id and safety checks to the agent
98
+ self.pending_call_id: str | None = None
99
+ self.pending_safety_checks: list[PendingSafetyCheck] = []
100
+
101
+ # override reasoning to "summary": "auto"
102
+ if self.reasoning is None:
103
+ self.reasoning = Reasoning(summary="auto")
104
+ else:
105
+ self.reasoning["summary"] = "auto"
106
+
107
+ # override truncation to "auto"
108
+ self.truncation = "auto"
109
+
110
+ if self.system_prompt:
111
+ self.system_prompt = f"{self.system_prompt}\n\n{OPERATOR_INSTRUCTIONS}"
112
+ else:
113
+ self.system_prompt = OPERATOR_INSTRUCTIONS
114
+
115
+ def _reset_response_state(self) -> None:
116
+ super()._reset_response_state()
117
+ self.pending_call_id = None
118
+ self.pending_safety_checks = []
119
+
120
+ def _to_openai_tool(
121
+ self, tool: types.Tool
122
+ ) -> (
123
+ FunctionShellToolParam | ApplyPatchToolParam | FunctionToolParam | ComputerToolParam | None
124
+ ):
125
+ if tool.name == self._operator_computer_tool_name:
126
+ return ComputerToolParam(
127
+ type="computer_use_preview",
128
+ display_width=self._operator_display_width,
129
+ display_height=self._operator_display_height,
130
+ environment=self._operator_environment,
131
+ )
132
+ return super()._to_openai_tool(tool)
133
+
134
+ def _extract_tool_call(self, item: Any) -> MCPToolCall | None:
135
+ """Route computer_call to the OpenAI-specific computer tool."""
136
+ if item.type == "computer_call":
137
+ self.pending_safety_checks = item.pending_safety_checks
138
+ return MCPToolCall(
139
+ name=self._operator_computer_tool_name,
140
+ arguments=item.action.to_dict(),
141
+ id=item.call_id,
142
+ )
143
+ return super()._extract_tool_call(item)
144
+
145
+ async def format_tool_results(
146
+ self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
147
+ ) -> list[ComputerCallOutput | FunctionCallOutput]:
148
+ remaining_calls: list[MCPToolCall] = []
149
+ remaining_results: list[MCPToolResult] = []
150
+ computer_outputs: list[ComputerCallOutput] = []
151
+ ordering: list[tuple[str, int]] = []
152
+
153
+ for call, result in zip(tool_calls, tool_results, strict=False):
154
+ if call.name == self._operator_computer_tool_name:
155
+ screenshot = self._extract_latest_screenshot(result)
156
+ if not screenshot:
157
+ self.console.warning_log(
158
+ "Computer tool result missing screenshot; skipping output."
159
+ )
160
+ continue
161
+ call_id = call.id or self.pending_call_id
162
+ if not call_id:
163
+ self.console.warning_log("Computer tool call missing ID; skipping output.")
164
+ continue
165
+ acknowledged_checks = []
166
+ for check in self.pending_safety_checks:
167
+ if hasattr(check, "model_dump"):
168
+ acknowledged_checks.append(check.model_dump())
169
+ elif isinstance(check, dict):
170
+ acknowledged_checks.append(check)
171
+ output_payload = ComputerCallOutput(
172
+ type="computer_call_output",
173
+ call_id=call_id,
174
+ output=ResponseComputerToolCallOutputScreenshotParam(
175
+ type="computer_screenshot",
176
+ image_url=f"data:image/png;base64,{screenshot}",
177
+ ),
178
+ acknowledged_safety_checks=acknowledged_checks if acknowledged_checks else None,
179
+ )
180
+ computer_outputs.append(output_payload)
181
+ self.pending_call_id = None
182
+ self.pending_safety_checks = []
183
+ ordering.append(("computer", len(computer_outputs) - 1))
184
+ else:
185
+ remaining_calls.append(call)
186
+ remaining_results.append(result)
187
+ ordering.append(("function", len(remaining_calls) - 1))
188
+
189
+ formatted: list[ComputerCallOutput | FunctionCallOutput] = []
190
+ function_outputs: list[FunctionCallOutput] = []
191
+ if remaining_calls:
192
+ function_outputs = await super().format_tool_results(remaining_calls, remaining_results)
193
+
194
+ for kind, idx in ordering:
195
+ if kind == "computer":
196
+ if idx < len(computer_outputs):
197
+ formatted.append(computer_outputs[idx])
198
+ else:
199
+ if idx < len(function_outputs):
200
+ formatted.append(function_outputs[idx])
201
+ return formatted
202
+
203
+ def _extract_latest_screenshot(self, result: MCPToolResult) -> str | None:
204
+ if not result.content:
205
+ return None
206
+ for content in reversed(result.content):
207
+ if isinstance(content, types.ImageContent):
208
+ return content.data
209
+ if isinstance(content, types.TextContent) and result.isError:
210
+ self.console.error_log(f"Computer tool error: {content.text}")
211
+ return None
hud/agents/tests/conftest.py ADDED
@@ -0,0 +1,133 @@
1
+ """Shared test fixtures for agent tests."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import pytest
8
+ from mcp import types
9
+
10
+ from hud.environment.router import ToolRouter
11
+ from hud.eval.context import EvalContext
12
+ from hud.types import MCPToolCall, MCPToolResult
13
+
14
+
15
+ class MockEvalContext(EvalContext):
16
+ """Mock EvalContext for testing agents.
17
+
18
+ This provides a minimal EvalContext implementation that can be used
19
+ to test agent initialization and tool calling without a real environment.
20
+ """
21
+
22
+ def __init__(
23
+ self,
24
+ prompt: str = "Test prompt",
25
+ tools: list[types.Tool] | None = None,
26
+ call_tool_handler: Any = None,
27
+ ) -> None:
28
+ # Core attributes
29
+ self.prompt = prompt
30
+ self._tools = tools or []
31
+ self._submitted: str | None = None
32
+ self.reward: float | None = None
33
+ self._call_tool_handler = call_tool_handler
34
+ self.tool_calls: list[tuple[str, dict[str, Any]]] = []
35
+
36
+ # Environment attributes
37
+ self._router = ToolRouter()
38
+ self._agent_include: list[str] | None = None
39
+ self._agent_exclude: list[str] | None = None
40
+
41
+ # EvalContext attributes
42
+ self._task = None
43
+ self.trace_id = "test-trace-id"
44
+ self.eval_name = "test-eval"
45
+ self.job_id: str | None = None
46
+ self.group_id: str | None = None
47
+ self.index = 0
48
+ self.variants: dict[str, Any] = {}
49
+ self.answer: str | None = None
50
+ self.system_prompt: str | None = None
51
+ self.error: BaseException | None = None
52
+ self.metadata: dict[str, Any] = {}
53
+ self.results: list[Any] = []
54
+ self._is_summary = False
55
+
56
+ def as_tools(self) -> list[types.Tool]:
57
+ return self._tools
58
+
59
+ @property
60
+ def has_scenario(self) -> bool:
61
+ return False
62
+
63
+ async def list_tools(self) -> list[types.Tool]:
64
+ return self._tools
65
+
66
+ async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
67
+ # Parse the call
68
+ if isinstance(call, tuple):
69
+ name, args = call[0], call[1] if len(call) > 1 else {}
70
+ elif hasattr(call, "name"):
71
+ name, args = call.name, getattr(call, "arguments", {}) or {}
72
+ else:
73
+ name, args = str(call), kwargs
74
+
75
+ self.tool_calls.append((name, args))
76
+
77
+ if self._call_tool_handler:
78
+ tc = MCPToolCall(name=name, arguments=args)
79
+ return self._call_tool_handler(tc)
80
+
81
+ return MCPToolResult(
82
+ content=[types.TextContent(type="text", text=f"Result from {name}")],
83
+ isError=False,
84
+ )
85
+
86
+ async def submit(self, answer: str) -> None:
87
+ self._submitted = answer
88
+
89
+
90
+ @pytest.fixture
91
+ def mock_eval_context() -> MockEvalContext:
92
+ """Create a basic mock EvalContext."""
93
+ return MockEvalContext()
94
+
95
+
96
+ @pytest.fixture
97
+ def mock_eval_context_with_tools() -> MockEvalContext:
98
+ """Create a mock EvalContext with test tools."""
99
+ return MockEvalContext(
100
+ tools=[
101
+ types.Tool(
102
+ name="test_tool",
103
+ description="A test tool",
104
+ inputSchema={"type": "object", "properties": {}},
105
+ )
106
+ ]
107
+ )
108
+
109
+
110
+ @pytest.fixture
111
+ def mock_eval_context_computer() -> MockEvalContext:
112
+ """Create a mock EvalContext with computer tool."""
113
+ return MockEvalContext(
114
+ tools=[
115
+ types.Tool(
116
+ name="computer",
117
+ description="Computer use tool",
118
+ inputSchema={"type": "object"},
119
+ )
120
+ ]
121
+ )
122
+
123
+
124
+ @pytest.fixture
125
+ def mock_eval_context_browser_tools() -> MockEvalContext:
126
+ """Create a mock EvalContext with browser-like tools."""
127
+ return MockEvalContext(
128
+ tools=[
129
+ types.Tool(name="screenshot", description="Take screenshot", inputSchema={}),
130
+ types.Tool(name="click", description="Click at coordinates", inputSchema={}),
131
+ types.Tool(name="type", description="Type text", inputSchema={}),
132
+ ]
133
+ )