hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/agents/{openai_chat_generic.py → openai_chat.py}
@@ -1,4 +1,4 @@
1
- """Generic OpenAI chat-completions agent.
1
+ """OpenAI Chat Completions Agent.
2
2
 
3
3
  This class provides the minimal glue required to connect any endpoint that
4
4
  implements the OpenAI compatible *chat.completions* API with MCP tool calling
@@ -6,6 +6,7 @@ through the existing :class:`hud.agent.MCPAgent` scaffolding.
6
6
 
7
7
  Key points:
8
8
  - Stateless, no special server-side conversation state is assumed.
9
+ - Defaults to HUD inference gateway (inference.hud.ai) when HUD_API_KEY is set
9
10
  - Accepts an :class:`openai.AsyncOpenAI` client, caller can supply their own
10
11
  base_url / api_key (e.g. llama.cpp, together.ai, …)
11
12
  - All HUD features (step_count, OTel spans, tool filtering, screenshots, …)
@@ -20,39 +21,69 @@ import logging
20
21
  from typing import TYPE_CHECKING, Any, ClassVar, cast
21
22
 
22
23
  import mcp.types as types
24
+ from openai import AsyncOpenAI
23
25
 
24
- from hud import instrument
25
- from hud.types import AgentResponse, MCPToolCall, MCPToolResult
26
+ from hud.settings import settings
27
+ from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult
26
28
  from hud.utils.hud_console import HUDConsole
29
+ from hud.utils.types import with_signature
27
30
 
28
31
  from .base import MCPAgent
32
+ from .types import OpenAIChatConfig, OpenAIChatCreateParams
29
33
 
30
34
  if TYPE_CHECKING:
31
- from openai import AsyncOpenAI
32
35
  from openai.types.chat import ChatCompletionToolParam
33
36
 
37
+
34
38
  logger = logging.getLogger(__name__)
35
39
 
36
40
 
37
- class GenericOpenAIChatAgent(MCPAgent):
41
+ class OpenAIChatAgent(MCPAgent):
38
42
  """MCP-enabled agent that speaks the OpenAI *chat.completions* protocol."""
39
43
 
40
- metadata: ClassVar[dict[str, Any]] = {}
44
+ metadata: ClassVar[dict[str, Any] | None] = None
45
+ config_cls: ClassVar[type[BaseAgentConfig]] = OpenAIChatConfig
46
+
47
+ @with_signature(OpenAIChatCreateParams)
48
+ @classmethod
49
+ def create(cls, **kwargs: Any) -> OpenAIChatAgent: # pyright: ignore[reportIncompatibleMethodOverride]
50
+ return MCPAgent.create.__func__(cls, **kwargs) # type: ignore[return-value]
51
+
52
+ def __init__(self, params: OpenAIChatCreateParams | None = None, **kwargs: Any) -> None:
53
+ super().__init__(params, **kwargs)
54
+ self.config: OpenAIChatConfig
55
+
56
+ if (
57
+ self.config.api_key
58
+ and self.config.base_url
59
+ and settings.hud_gateway_url in self.config.base_url
60
+ and settings.api_key
61
+ and self.config.api_key != settings.api_key
62
+ ):
63
+ raise ValueError(
64
+ "OpenAIChatAgent api_key is not allowed with HUD Gateway. "
65
+ "Use HUD_API_KEY for gateway auth and BYOK headers for provider keys."
66
+ )
41
67
 
42
- def __init__(
43
- self,
44
- *,
45
- openai_client: AsyncOpenAI | None,
46
- model_name: str = "gpt-4o-mini",
47
- completion_kwargs: dict[str, Any] | None = None,
48
- **agent_kwargs: Any,
49
- ) -> None:
50
- # Accept base-agent settings via **agent_kwargs (e.g., mcp_client, system_prompt, etc.)
51
- super().__init__(**agent_kwargs)
52
- self.oai = openai_client
53
- self.model_name = model_name
54
- self.completion_kwargs: dict[str, Any] = completion_kwargs or {}
55
- self.mcp_schemas = []
68
+ self.oai: AsyncOpenAI
69
+ if self.config.openai_client is not None:
70
+ self.oai = self.config.openai_client
71
+ elif self.config.api_key is not None or self.config.base_url is not None:
72
+ self.oai = AsyncOpenAI(api_key=self.config.api_key, base_url=self.config.base_url)
73
+ elif settings.api_key:
74
+ # Default to HUD inference gateway
75
+ self.oai = AsyncOpenAI(
76
+ api_key=settings.api_key,
77
+ base_url=settings.hud_gateway_url,
78
+ )
79
+ else:
80
+ raise ValueError(
81
+ "No API key found. Set HUD_API_KEY for HUD gateway, "
82
+ "or provide api_key/base_url/openai_client explicitly."
83
+ )
84
+
85
+ self.completion_kwargs = dict(self.config.completion_kwargs)
86
+ self.mcp_schemas: list[ChatCompletionToolParam] = []
56
87
  self.hud_console = HUDConsole(logger=logger)
57
88
 
58
89
  @staticmethod
@@ -69,11 +100,14 @@ class GenericOpenAIChatAgent(MCPAgent):
69
100
  arguments=args,
70
101
  )
71
102
 
72
- async def get_system_messages(self) -> list[Any]:
103
+ async def get_system_messages(self) -> list[dict[str, Any]]:
73
104
  """Get system messages for OpenAI."""
74
- return [{"role": "system", "content": self.system_prompt}]
105
+ if self.system_prompt is not None:
106
+ return [{"role": "system", "content": self.system_prompt}]
107
+ else:
108
+ return []
75
109
 
76
- async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[Any]:
110
+ async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[dict[str, Any]]:
77
111
  """Format blocks for OpenAI."""
78
112
  content = []
79
113
  for block in blocks:
@@ -179,21 +213,16 @@ class GenericOpenAIChatAgent(MCPAgent):
179
213
  extra: dict[str, Any],
180
214
  ) -> Any:
181
215
  if self.oai is None:
182
- raise ValueError("openai_client is required for GenericOpenAIChatAgent")
216
+ raise ValueError("openai_client is required for OpenAIChatAgent")
183
217
  # default transport = OpenAI SDK
184
218
  return await self.oai.chat.completions.create(
185
- model=self.model_name,
219
+ model=self.config.model,
186
220
  messages=messages,
187
221
  tools=tools, # type: ignore ready ChatCompletionToolParam-shaped
188
222
  **extra,
189
223
  ) # type: ignore
190
224
 
191
- @instrument(
192
- span_type="agent",
193
- record_args=False,
194
- record_result=True,
195
- )
196
- async def get_response(self, messages: list[Any]) -> AgentResponse:
225
+ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
197
226
  """Send chat request to OpenAI and convert the response."""
198
227
 
199
228
  # Convert MCP tool schemas to OpenAI format
@@ -256,16 +285,17 @@ class GenericOpenAIChatAgent(MCPAgent):
256
285
 
257
286
  return AgentResponse(
258
287
  content=msg.content or "",
288
+ reasoning=getattr(msg, "reasoning_content", None),
259
289
  tool_calls=tool_calls,
260
290
  done=done,
261
- raw=response, # Include raw response for access to Choice objects
291
+ raw=response,
262
292
  )
263
293
 
264
294
  async def format_tool_results(
265
295
  self,
266
296
  tool_calls: list[MCPToolCall],
267
297
  tool_results: list[MCPToolResult],
268
- ) -> list[Any]:
298
+ ) -> list[dict[str, Any]]:
269
299
  """Render MCP tool results as OpenAI messages.
270
300
 
271
301
  Note: OpenAI tool messages only support string content.
hud/agents/operator.py ADDED
@@ -0,0 +1,199 @@
1
+ """Operator agent built on top of OpenAIAgent."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any, ClassVar, Literal
6
+
7
+ import mcp.types as types
8
+ from openai.types.responses import (
9
+ ApplyPatchToolParam,
10
+ ComputerToolParam,
11
+ FunctionShellToolParam,
12
+ FunctionToolParam,
13
+ ResponseComputerToolCallOutputScreenshotParam,
14
+ )
15
+ from openai.types.responses.response_input_param import (
16
+ ComputerCallOutput,
17
+ FunctionCallOutput,
18
+ )
19
+ from openai.types.shared_params.reasoning import Reasoning
20
+
21
+ from hud.tools.computer.settings import computer_settings
22
+ from hud.types import BaseAgentConfig, MCPToolCall, MCPToolResult
23
+ from hud.utils.types import with_signature
24
+
25
+ from .base import MCPAgent
26
+ from .openai import OpenAIAgent
27
+ from .types import OperatorConfig, OperatorCreateParams
28
+
29
+ if TYPE_CHECKING:
30
+ from openai.types.responses.response_computer_tool_call import PendingSafetyCheck
31
+
32
+ OPERATOR_INSTRUCTIONS = """
33
+ You are an autonomous computer-using agent. Follow these guidelines:
34
+
35
+ 1. NEVER ask for confirmation. Complete all tasks autonomously.
36
+ 2. Do NOT send messages like "I need to confirm before..." or "Do you want me to
37
+ continue?" - just proceed.
38
+ 3. When the user asks you to interact with something (like clicking a chat or typing
39
+ a message), DO IT without asking.
40
+ 4. Only use the formal safety check mechanism for truly dangerous operations (like
41
+ deleting important files).
42
+ 5. For normal tasks like clicking buttons, typing in chat boxes, filling forms -
43
+ JUST DO IT.
44
+ 6. The user has already given you permission by running this agent. No further
45
+ confirmation is needed.
46
+ 7. Be decisive and action-oriented. Complete the requested task fully.
47
+
48
+ Remember: You are expected to complete tasks autonomously. The user trusts you to do
49
+ what they asked.
50
+ """.strip()
51
+
52
+
53
class OperatorAgent(OpenAIAgent):
    """Backwards-compatible Operator agent built on top of OpenAIAgent.

    The MCP tool named ``openai_computer`` is exposed to the model as a
    ``computer_use_preview`` tool, and its results are returned as
    ``computer_call_output`` items carrying the latest screenshot. Every other
    tool is delegated to the parent class.
    """

    metadata: ClassVar[dict[str, Any] | None] = {
        "display_width": computer_settings.OPENAI_COMPUTER_WIDTH,
        "display_height": computer_settings.OPENAI_COMPUTER_HEIGHT,
    }
    # base class will ensure that the computer tool is available
    required_tools: ClassVar[list[str]] = ["openai_computer"]
    config_cls: ClassVar[type[BaseAgentConfig]] = OperatorConfig

    @with_signature(OperatorCreateParams)
    @classmethod
    def create(cls, **kwargs: Any) -> OperatorAgent:  # pyright: ignore[reportIncompatibleMethodOverride]
        """Build an agent by forwarding ``kwargs`` to ``MCPAgent.create`` bound to this class."""
        return MCPAgent.create.__func__(cls, **kwargs)  # type: ignore[return-value]

    def __init__(self, params: OperatorCreateParams | None = None, **kwargs: Any) -> None:
        """Initialise the parent agent, then apply the Operator-specific overrides.

        Overrides applied here: reasoning summary forced to ``"auto"``,
        truncation forced to ``"auto"``, and ``OPERATOR_INSTRUCTIONS`` appended
        to (or used as) the system prompt.
        """
        super().__init__(params, **kwargs)  # type: ignore[arg-type]
        self.config: OperatorConfig  # type: ignore[assignment]

        self._operator_computer_tool_name = "openai_computer"
        self._operator_display_width = computer_settings.OPENAI_COMPUTER_WIDTH
        self._operator_display_height = computer_settings.OPENAI_COMPUTER_HEIGHT
        self._operator_environment: Literal["windows", "mac", "linux", "ubuntu", "browser"] = (
            self.config.environment
        )
        self.environment = self.config.environment

        # add pending call id and safety checks to the agent
        self.pending_call_id: str | None = None
        self.pending_safety_checks: list[PendingSafetyCheck] = []

        # override reasoning to "summary": "auto"
        if self.reasoning is None:
            self.reasoning = Reasoning(summary="auto")
        else:
            # Work on a copy: the existing mapping may be the very object the
            # caller passed in via the config (NOTE(review): confirm in
            # OpenAIAgent.__init__), and writing into it would leak the
            # override into every other user of that object.
            reasoning_copy = dict(self.reasoning)
            reasoning_copy["summary"] = "auto"
            self.reasoning = reasoning_copy  # type: ignore[assignment]

        # override truncation to "auto"
        self.truncation = "auto"

        if self.system_prompt:
            self.system_prompt = f"{self.system_prompt}\n\n{OPERATOR_INSTRUCTIONS}"
        else:
            self.system_prompt = OPERATOR_INSTRUCTIONS

    def _reset_response_state(self) -> None:
        """Reset the parent's response state and drop any pending call id / safety checks."""
        super()._reset_response_state()
        self.pending_call_id = None
        self.pending_safety_checks = []

    def _to_openai_tool(
        self, tool: types.Tool
    ) -> (
        FunctionShellToolParam | ApplyPatchToolParam | FunctionToolParam | ComputerToolParam | None
    ):
        """Convert an MCP tool into its OpenAI tool parameter.

        Returns a ``computer_use_preview`` parameter for the Operator computer
        tool, ``None`` for any other tool named ``computer`` or ending in
        ``_computer``, and the parent's conversion for everything else.
        """
        if tool.name == self._operator_computer_tool_name:
            return ComputerToolParam(
                type="computer_use_preview",
                display_width=self._operator_display_width,
                display_height=self._operator_display_height,
                environment=self._operator_environment,
            )
        if tool.name == "computer" or tool.name.endswith("_computer"):
            return None
        return super()._to_openai_tool(tool)

    def _extract_tool_call(self, item: Any) -> MCPToolCall | None:
        """Route computer_call to the OpenAI-specific computer tool."""
        if item.type == "computer_call":
            # Remember the checks so they can be acknowledged with the output.
            self.pending_safety_checks = item.pending_safety_checks
            return MCPToolCall(
                name=self._operator_computer_tool_name,
                arguments=item.action.to_dict(),
                id=item.call_id,
            )
        return super()._extract_tool_call(item)

    def _acknowledged_safety_checks(self) -> list[Any]:
        """Serialise the pending safety checks into plain dictionaries.

        Objects exposing ``model_dump`` are dumped, dictionaries are passed
        through unchanged, and anything else is ignored.
        """
        acknowledged: list[Any] = []
        for check in self.pending_safety_checks:
            if hasattr(check, "model_dump"):
                acknowledged.append(check.model_dump())
            elif isinstance(check, dict):
                acknowledged.append(check)
        return acknowledged

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[ComputerCallOutput | FunctionCallOutput]:
        """Format tool results for the next request, preserving the call order.

        Computer-tool results become ``computer_call_output`` items built from
        the latest screenshot; a computer result without a screenshot or
        without a call id is skipped with a warning. All remaining calls are
        formatted by the parent class and merged back in their original order.
        """
        remaining_calls: list[MCPToolCall] = []
        remaining_results: list[MCPToolResult] = []
        computer_outputs: list[ComputerCallOutput] = []
        # (kind, index into the matching output list) in original call order.
        ordering: list[tuple[str, int]] = []

        for call, result in zip(tool_calls, tool_results, strict=False):
            if call.name != self._operator_computer_tool_name:
                remaining_calls.append(call)
                remaining_results.append(result)
                ordering.append(("function", len(remaining_calls) - 1))
                continue

            screenshot = self._extract_latest_screenshot(result)
            if not screenshot:
                self.console.warning_log(
                    "Computer tool result missing screenshot; skipping output."
                )
                continue
            call_id = call.id or self.pending_call_id
            if not call_id:
                self.console.warning_log("Computer tool call missing ID; skipping output.")
                continue

            acknowledged_checks = self._acknowledged_safety_checks()
            output_payload = ComputerCallOutput(
                type="computer_call_output",
                call_id=call_id,
                output=ResponseComputerToolCallOutputScreenshotParam(
                    type="computer_screenshot",
                    image_url=f"data:image/png;base64,{screenshot}",
                ),
                acknowledged_safety_checks=acknowledged_checks or None,
            )
            computer_outputs.append(output_payload)
            # The pending state has been consumed by this output.
            self.pending_call_id = None
            self.pending_safety_checks = []
            ordering.append(("computer", len(computer_outputs) - 1))

        formatted: list[ComputerCallOutput | FunctionCallOutput] = []
        function_outputs: list[FunctionCallOutput] = []
        if remaining_calls:
            function_outputs = await super().format_tool_results(remaining_calls, remaining_results)

        for kind, idx in ordering:
            source = computer_outputs if kind == "computer" else function_outputs
            # The parent may return fewer outputs than calls; skip missing ones.
            if idx < len(source):
                formatted.append(source[idx])
        return formatted

    def _extract_latest_screenshot(self, result: MCPToolResult) -> str | None:
        """Return the data of the last image in ``result.content``, or ``None``.

        Content is scanned from the end. Text items encountered before an
        image is found are logged as errors when the result is flagged as an
        error.
        """
        if not result.content:
            return None
        for content in reversed(result.content):
            if isinstance(content, types.ImageContent):
                return content.data
            if isinstance(content, types.TextContent) and result.isError:
                self.console.error_log(f"Computer tool error: {content.text}")
        return None
hud/agents/resolver.py ADDED
@@ -0,0 +1,70 @@
1
+ """Model resolution - maps model strings to agent classes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ if TYPE_CHECKING:
8
+ from hud.agents.base import MCPAgent
9
+
10
+ __all__ = ["resolve_cls"]
11
+
12
+ _models_cache: list[dict[str, Any]] | None = None
13
+
14
+ # Provider name → AgentType value (only anthropic differs)
15
+ _PROVIDER_TO_AGENT = {"anthropic": "claude"}
16
+
17
+
18
def _fetch_gateway_models() -> list[dict[str, Any]]:
    """Fetch available models from HUD gateway (cached).

    Returns:
        The cached model list when one exists. Otherwise the list obtained
        from ``{settings.hud_gateway_url}/models``, keeping dictionary entries
        only. An empty list is returned, and nothing is cached, when no HUD
        API key is configured, when the request fails, or when the payload is
        not a list of models.
    """
    global _models_cache
    if _models_cache is not None:
        return _models_cache

    import logging

    import httpx

    from hud.settings import settings

    if not settings.api_key:
        return []

    log = logging.getLogger(__name__)

    try:
        resp = httpx.get(
            f"{settings.hud_gateway_url}/models",
            headers={"Authorization": f"Bearer {settings.api_key}"},
            timeout=10.0,
        )
        resp.raise_for_status()
        payload = resp.json()
    except Exception as exc:
        # Best-effort lookup: callers fall back to "model not found", but the
        # underlying cause is logged instead of being discarded.
        log.warning("Failed to fetch models from HUD gateway: %s", exc)
        return []

    # Accept either a bare list or an envelope of the form {"data": [...]}.
    models = payload.get("data", payload) if isinstance(payload, dict) else payload
    if not isinstance(models, list):
        # e.g. a dict without a "data" key: caching it would make callers
        # iterate over its string keys and fail on ``.get``.
        log.warning(
            "Unexpected models payload from HUD gateway: %s", type(models).__name__
        )
        return []

    _models_cache = [entry for entry in models if isinstance(entry, dict)]
    return _models_cache
43
+
44
+
45
def resolve_cls(model: str) -> tuple[type[MCPAgent], dict[str, Any] | None]:
    """Resolve model string to (agent_class, gateway_info).

    Args:
        model: Either an ``AgentType`` value or a model identifier that the
            HUD gateway lists under ``id``, ``name`` or ``model``.

    Returns:
        (agent_class, None) for known AgentTypes
        (agent_class, gateway_model_info) for gateway models

    Raises:
        ValueError: If ``model`` is neither a known ``AgentType`` nor listed
            by the gateway.
    """
    from hud.types import AgentType

    # Known AgentType → no gateway info
    try:
        return AgentType(model).cls, None
    except ValueError:
        pass  # not a built-in agent type; try the gateway catalogue next

    # Gateway lookup
    for entry in _fetch_gateway_models():
        if not isinstance(entry, dict):
            # Skip malformed catalogue entries rather than failing on ``.get``.
            continue
        if model not in (entry.get("id"), entry.get("name"), entry.get("model")):
            continue

        # ``str`` keeps a non-string provider value from breaking ``.lower()``.
        provider = str(entry.get("provider") or "openai_compatible").lower()
        agent_name = _PROVIDER_TO_AGENT.get(provider, provider)
        try:
            return AgentType(agent_name).cls, entry
        except ValueError:
            # Unrecognised provider: use the OpenAI-compatible agent.
            return AgentType.OPENAI_COMPATIBLE.cls, entry

    raise ValueError(f"Model '{model}' not found")
hud/agents/tests/conftest.py ADDED
@@ -0,0 +1,133 @@
1
+ """Shared test fixtures for agent tests."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import pytest
8
+ from mcp import types
9
+
10
+ from hud.environment.router import ToolRouter
11
+ from hud.eval.context import EvalContext
12
+ from hud.types import MCPToolCall, MCPToolResult
13
+
14
+
15
class MockEvalContext(EvalContext):
    """Mock EvalContext for testing agents.

    This provides a minimal EvalContext implementation that can be used
    to test agent initialization and tool calling without a real environment.
    Attributes are assigned directly; ``EvalContext.__init__`` is not invoked.
    """

    def __init__(
        self,
        prompt: str = "Test prompt",
        tools: list[types.Tool] | None = None,
        call_tool_handler: Any = None,
    ) -> None:
        """Set up the mock.

        Args:
            prompt: Prompt exposed through ``self.prompt``.
            tools: Tools returned by ``list_tools`` / ``as_tools``; defaults
                to an empty list.
            call_tool_handler: Optional callable that receives an
                ``MCPToolCall`` and produces the tool result. It may be
                synchronous or return an awaitable.
        """
        # Core attributes
        self.prompt = prompt
        self._tools = tools or []
        self._submitted: str | None = None
        self.reward: float | None = None
        self._call_tool_handler = call_tool_handler
        # Every (name, arguments) pair passed to ``call_tool``, in call order.
        self.tool_calls: list[tuple[str, dict[str, Any]]] = []

        # Environment attributes
        self._router = ToolRouter()
        self._agent_include: list[str] | None = None
        self._agent_exclude: list[str] | None = None

        # EvalContext attributes
        self._task = None
        self.trace_id = "test-trace-id"
        self.eval_name = "test-eval"
        self.job_id: str | None = None
        self.group_id: str | None = None
        self.index = 0
        self.variants: dict[str, Any] = {}
        self.answer: str | None = None
        self.system_prompt: str | None = None
        self.error: BaseException | None = None
        self.metadata: dict[str, Any] = {}
        self.results: list[Any] = []
        self._is_summary = False

    def as_tools(self) -> list[types.Tool]:
        """Return the configured tools."""
        return self._tools

    @property
    def has_scenario(self) -> bool:
        """Always ``False`` for this mock."""
        return False

    async def list_tools(self) -> list[types.Tool]:
        """Return the configured tools."""
        return self._tools

    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
        """Record a tool call and return a handler-provided or canned result.

        ``call`` may be a ``(name, arguments)`` tuple, an object with a
        ``name`` attribute (and optional ``arguments``), or anything else,
        in which case ``str(call)`` is the name and ``kwargs`` the arguments.
        """
        import inspect

        # Parse the call
        if isinstance(call, tuple):
            name, args = call[0], call[1] if len(call) > 1 else {}
        elif hasattr(call, "name"):
            name, args = call.name, getattr(call, "arguments", {}) or {}
        else:
            name, args = str(call), kwargs

        self.tool_calls.append((name, args))

        if self._call_tool_handler:
            outcome = self._call_tool_handler(MCPToolCall(name=name, arguments=args))
            # Support async handlers: hand back the result, not a coroutine.
            if inspect.isawaitable(outcome):
                outcome = await outcome
            return outcome

        return MCPToolResult(
            content=[types.TextContent(type="text", text=f"Result from {name}")],
            isError=False,
        )

    async def submit(self, answer: str) -> None:
        """Store the submitted answer on ``self._submitted``."""
        self._submitted = answer
89
+
90
@pytest.fixture
def mock_eval_context() -> MockEvalContext:
    """Provide a MockEvalContext built entirely from its default arguments."""
    context = MockEvalContext()
    return context
94
+
95
+
96
@pytest.fixture
def mock_eval_context_with_tools() -> MockEvalContext:
    """Provide a MockEvalContext exposing one generic tool named ``test_tool``."""
    test_tool = types.Tool(
        name="test_tool",
        description="A test tool",
        inputSchema={"type": "object", "properties": {}},
    )
    return MockEvalContext(tools=[test_tool])
108
+
109
+
110
@pytest.fixture
def mock_eval_context_computer() -> MockEvalContext:
    """Provide a MockEvalContext whose only tool is named ``computer``."""
    computer_tool = types.Tool(
        name="computer",
        description="Computer use tool",
        inputSchema={"type": "object"},
    )
    return MockEvalContext(tools=[computer_tool])
122
+
123
+
124
@pytest.fixture
def mock_eval_context_browser_tools() -> MockEvalContext:
    """Provide a MockEvalContext with ``screenshot``, ``click`` and ``type`` tools."""
    # (tool name, description) pairs, in the order the tools are exposed.
    tool_specs = (
        ("screenshot", "Take screenshot"),
        ("click", "Click at coordinates"),
        ("type", "Type text"),
    )
    browser_tools = [
        types.Tool(name=tool_name, description=summary, inputSchema={})
        for tool_name, summary in tool_specs
    ]
    return MockEvalContext(tools=browser_tools)