hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,233 @@
1
+ """Runtime tests for MCPAgent base class."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import mcp.types as types
8
+ import pytest
9
+
10
+ from hud.agents.base import BaseCreateParams, MCPAgent, find_content, find_reward, text_to_blocks
11
+ from hud.environment.router import ToolRouter
12
+ from hud.eval.context import EvalContext
13
+ from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult
14
+
15
+
16
class DummyConfig(BaseAgentConfig):
    """Minimal agent configuration used by the dummy agent in these tests."""

    # Fixed placeholder identifiers; no real model is contacted by DummyAgent.
    model_name: str = "DummyAgent"
    model: str = "dummy-model"
19
+
20
+
21
class DummyCreateParams(BaseCreateParams, DummyConfig):
    """Creation parameters for ``DummyAgent``; adds nothing beyond its two bases."""
23
+
24
+
25
class MockEvalContext(EvalContext):
    """EvalContext stand-in for tests.

    ``__init__`` assigns every attribute directly and does not call
    ``EvalContext.__init__``. Tool calls are answered by an optional handler
    installed with ``set_call_tool_handler``; when no handler is installed,
    every call yields a successful ``"ok"`` text result.
    """

    def __init__(
        self,
        prompt: str = "Test prompt",
        tools: list[types.Tool] | None = None,
    ) -> None:
        """Populate the context with fixed test values.

        Args:
            prompt: Prompt text exposed as ``self.prompt``.
            tools: Tools returned by ``as_tools`` / ``list_tools``; a falsy
                value is replaced by an empty list.
        """
        # --- core state ---
        self.prompt = prompt
        self._tools = tools if tools else []
        self._submitted: str | None = None
        self.reward: float | None = None
        self._call_tool_handler: Any = None

        # --- environment state ---
        self._router = ToolRouter()
        self._agent_include: list[str] | None = None
        self._agent_exclude: list[str] | None = None

        # --- EvalContext state ---
        self._task = None
        self.trace_id = "test-trace-id"
        self.eval_name = "test-eval"
        self.job_id: str | None = None
        self.group_id: str | None = None
        self.index = 0
        self.variants: dict[str, Any] = {}
        self.answer: str | None = None
        self.system_prompt: str | None = None
        self.error: BaseException | None = None
        self.metadata: dict[str, Any] = {}
        self.results: list[Any] = []
        self._is_summary = False

    def as_tools(self) -> list[types.Tool]:
        """Return the tool list given at construction."""
        return self._tools

    @property
    def has_scenario(self) -> bool:
        """Always ``False`` for this mock."""
        return False

    def set_call_tool_handler(self, handler: Any) -> None:
        """Install ``handler``; ``call_tool`` invokes it with the parsed tool call."""
        self._call_tool_handler = handler

    async def list_tools(self) -> list[types.Tool]:
        """Return the tool list given at construction."""
        return self._tools

    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
        """Dispatch a tool call to the installed handler, or return a canned result.

        ``call`` may be a ``(name, arguments)`` tuple (arguments optional), an
        object with a ``name`` attribute (passed through unchanged), or anything
        else, which is stringified into the tool name with ``kwargs`` as the
        arguments. The handler is called synchronously and its return value is
        returned as-is; exceptions it raises propagate to the caller.
        """
        handler = self._call_tool_handler
        if not handler:
            # No handler installed: report a successful "ok" result.
            ok_block = types.TextContent(type="text", text="ok")
            return MCPToolResult(
                content=[ok_block],
                isError=False,
            )

        # Normalise the supported call shapes before handing off.
        if isinstance(call, tuple):
            arguments = call[1] if len(call) > 1 else {}
            tool_call = MCPToolCall(name=call[0], arguments=arguments)
        elif hasattr(call, "name"):
            tool_call = call
        else:
            tool_call = MCPToolCall(name=str(call), arguments=kwargs)
        return handler(tool_call)

    async def submit(self, answer: str) -> None:
        """Record ``answer`` in ``self._submitted``."""
        self._submitted = answer
90
+
91
+
92
class DummyAgent(MCPAgent):
    """MCPAgent subclass with canned responses, used to exercise base-class logic."""

    config_cls = DummyConfig

    def __init__(self, **kwargs: Any) -> None:
        """Build ``DummyCreateParams`` from ``kwargs`` and hand them to ``MCPAgent``."""
        super().__init__(DummyCreateParams(**kwargs))

    async def get_system_messages(self) -> list[types.ContentBlock]:
        """Return a single text block containing ``"sys"``."""
        system_block = types.TextContent(type="text", text="sys")
        return [system_block]

    async def get_response(self, messages: list[Any]) -> AgentResponse:
        """Ignore ``messages`` and return a finished response with no tool calls."""
        return AgentResponse(content="ok", tool_calls=[], done=True)

    async def format_blocks(self, blocks: list[Any]) -> list[Any]:
        """Return ``blocks`` unchanged."""
        return blocks

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[Any]:
        """Ignore both arguments and return one text block containing ``"tools"``."""
        summary_block = types.TextContent(text="tools", type="text")
        return [summary_block]
112
+
113
+
114
def test_find_reward_and_content_extractors() -> None:
    """find_reward and find_content read their values out of tool results."""
    # Reward supplied through structuredContent.
    structured = MCPToolResult(
        content=text_to_blocks("{}"),
        isError=False,
        structuredContent={"reward": 0.7},
    )
    assert find_reward(structured) == 0.7

    # Values encoded as JSON in the text content; the reward sits under "score".
    json_text = MCPToolResult(
        content=text_to_blocks('{"score": 0.5, "content": "hi"}'),
        isError=False,
    )
    assert find_reward(json_text) == 0.5
    assert find_content(json_text) == "hi"
126
+
127
+
128
def test_get_available_tools_before_run_raises() -> None:
    """get_available_tools raises RuntimeError on an agent that has never run."""
    # Constructed outside the raises block so only the lookup itself is checked.
    fresh_agent = DummyAgent()
    with pytest.raises(RuntimeError):
        fresh_agent.get_available_tools()
133
+
134
+
135
@pytest.mark.asyncio
async def test_format_message_invalid_type_raises() -> None:
    """format_message rejects an unsupported message type with ValueError."""
    fresh_agent = DummyAgent()
    bad_message = {"oops": 1}
    with pytest.raises(ValueError):
        await fresh_agent.format_message(bad_message)  # type: ignore
141
+
142
+
143
def test_text_to_blocks_shapes() -> None:
    """text_to_blocks returns a non-empty list whose first item is TextContent."""
    result = text_to_blocks("x")
    assert isinstance(result, list)
    assert result
    assert isinstance(result[0], types.TextContent)
147
+
148
+
149
@pytest.mark.asyncio
async def test_run_with_eval_context() -> None:
    """run() on a mock EvalContext finishes in one step without an error."""
    eval_ctx = MockEvalContext(prompt="hello")
    dummy = DummyAgent()

    outcome = await dummy.run(eval_ctx, max_steps=1)

    assert outcome.done is True
    assert outcome.isError is False
157
+
158
+
159
@pytest.mark.asyncio
async def test_run_requires_eval_context() -> None:
    """run() raises TypeError when given something other than an EvalContext."""
    dummy = DummyAgent()
    not_a_context = "hello"
    with pytest.raises(TypeError, match="must be EvalContext"):
        await dummy.run(not_a_context)  # type: ignore
165
+
166
+
167
@pytest.mark.asyncio
async def test_run_requires_prompt() -> None:
    """run() raises ValueError when the context's prompt is an empty string."""
    promptless_ctx = MockEvalContext(prompt="")
    dummy = DummyAgent()
    with pytest.raises(ValueError, match="prompt is not set"):
        await dummy.run(promptless_ctx)
174
+
175
+
176
@pytest.mark.asyncio
async def test_call_tools_error_paths() -> None:
    """call_tools reports a handler exception as an error result for that call."""
    invocations = 0
    ok_result = MCPToolResult(content=text_to_blocks("ok"), isError=False)

    def handler(tool_call: MCPToolCall) -> MCPToolResult:
        # First invocation succeeds; every later one fails.
        nonlocal invocations
        invocations += 1
        if invocations != 1:
            raise RuntimeError("boom")
        return ok_result

    ctx = MockEvalContext(prompt="test")
    ctx.set_call_tool_handler(handler)
    agent = DummyAgent()

    # Attach the context by hand instead of going through run().
    agent.ctx = ctx
    await agent._initialize_from_ctx(ctx)

    pending_calls = [
        MCPToolCall(name="a", arguments={}),
        MCPToolCall(name="b", arguments={}),
    ]
    results = await agent.call_tools(pending_calls)

    first, second = results[0], results[1]
    assert first.isError is False
    assert second.isError is True
201
+
202
+
203
@pytest.mark.asyncio
async def test_call_tools_timeout_raises() -> None:
    """A TimeoutError raised by the tool handler propagates out of call_tools."""

    def handler(tool_call: MCPToolCall) -> MCPToolResult:
        raise TimeoutError("timeout")

    ctx = MockEvalContext(prompt="test")
    ctx.set_call_tool_handler(handler)
    agent = DummyAgent()

    # Attach the context by hand instead of going through run().
    agent.ctx = ctx
    await agent._initialize_from_ctx(ctx)

    # A single call object (not a list) is passed here, as in the original test.
    single_call = MCPToolCall(name="x", arguments={})
    with pytest.raises(TimeoutError):
        await agent.call_tools(single_call)
219
+
220
+
221
@pytest.mark.asyncio
async def test_get_available_tools_after_run() -> None:
    """run() with a tool-bearing context leaves the agent flagged as initialized."""
    sample_tool = types.Tool(name="test_tool", description="Test", inputSchema={})
    ctx = MockEvalContext(prompt="hello", tools=[sample_tool])
    agent = DummyAgent()

    # run() performs the agent's initialization.
    await agent.run(ctx, max_steps=1)

    # Original author's note: after cleanup the ctx is cleared, so the tools
    # cannot be read back here; they were available during the run. Only the
    # initialization flag is checked.
    assert agent._initialized is True