hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,401 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+ import pytest
6
+
7
+ from hud.telemetry.instrument import _serialize_value, instrument
8
+
9
+
10
def test_serialize_value_simple_types():
    """Scalars (str, int, float, bool, None) pass through _serialize_value unchanged."""
    # Plain values are compared by equality.
    for scalar in ("string", 42, 3.14):
        assert _serialize_value(scalar) == scalar
    # Singletons are compared by identity.
    assert _serialize_value(True) is True
    assert _serialize_value(None) is None
17
+
18
+
19
def test_serialize_value_list():
    """A short list serializes to an equal list."""
    assert _serialize_value([1, 2, 3]) == [1, 2, 3]
23
+
24
+
25
def test_serialize_value_list_truncation():
    """A list longer than max_items is cut down to its first max_items entries."""
    serialized = _serialize_value(list(range(20)), max_items=5)
    assert len(serialized) == 5
    assert serialized == [0, 1, 2, 3, 4]
31
+
32
+
33
def test_serialize_value_tuple():
    """A tuple is serialized as a list holding the same items."""
    serialized = _serialize_value((1, 2, 3))
    # The expected value is a list, not a tuple.
    assert serialized == [1, 2, 3]
37
+
38
+
39
def test_serialize_value_tuple_truncation():
    """A tuple longer than max_items yields exactly max_items serialized entries."""
    serialized = _serialize_value(tuple(range(20)), max_items=5)
    assert len(serialized) == 5
44
+
45
+
46
def test_serialize_value_dict():
    """A small dict serializes to an equal dict."""
    assert _serialize_value({"key": "value"}) == {"key": "value"}
50
+
51
+
52
def test_serialize_value_dict_truncation():
    """A dict with more than max_items entries is reduced to max_items entries."""
    serialized = _serialize_value({f"key{i}": i for i in range(20)}, max_items=5)
    assert len(serialized) == 5
57
+
58
+
59
def test_serialize_value_complex_object():
    """A dataclass instance is serialized into a dict keyed by its field names."""

    @dataclass
    class CustomObj:
        name: str
        value: int

    serialized = _serialize_value(CustomObj(name="test", value=42))
    assert isinstance(serialized, dict)
    assert (serialized["name"], serialized["value"]) == ("test", 42)
72
+
73
+
74
def test_serialize_value_fallback():
    """An object that cannot be serialized structurally falls back to a string."""

    class WeirdObj:
        def __init__(self):
            raise Exception("Can't access")

    # __new__ builds the instance without running the raising __init__.
    serialized = _serialize_value(WeirdObj.__new__(WeirdObj))
    # The fallback is a string that mentions the class name.
    assert isinstance(serialized, str)
    assert "WeirdObj" in serialized
86
+
87
+
88
@pytest.mark.asyncio
async def test_instrument_async_basic():
    """A bare @instrument on a coroutine function leaves its return value intact."""

    @instrument
    async def test_func(x: int, y: int) -> int:
        return x + y

    assert await test_func(2, 3) == 5
98
+
99
+
100
@pytest.mark.asyncio
async def test_instrument_async_with_params():
    """@instrument accepts name= and category= and still returns the wrapped result."""

    @instrument(name="custom_name", category="custom_type")
    async def test_func(x: int) -> int:
        return x * 2

    assert await test_func(5) == 10
110
+
111
+
112
@pytest.mark.asyncio
async def test_instrument_async_with_exception():
    """An exception raised inside an instrumented coroutine propagates to the caller."""

    @instrument
    async def test_func():
        raise ValueError("Test error")

    # The original exception type and message must both survive the wrapper.
    with pytest.raises(ValueError, match="Test error"):
        await test_func()
122
+
123
+
124
@pytest.mark.asyncio
async def test_instrument_async_no_record_args():
    """record_args=False does not change what the instrumented coroutine returns."""

    @instrument(record_args=False)
    async def test_func(x: int) -> int:
        return x

    assert await test_func(42) == 42
134
+
135
+
136
@pytest.mark.asyncio
async def test_instrument_async_no_record_result():
    """record_result=False does not change what the instrumented coroutine returns."""

    @instrument(record_result=False)
    async def test_func() -> str:
        return "test"

    assert await test_func() == "test"
146
+
147
+
148
@pytest.mark.asyncio
async def test_instrument_async_with_category():
    """A custom category on an async function leaves the return value intact."""

    @instrument(category="agent")
    async def test_func() -> int:
        return 42

    assert await test_func() == 42
158
+
159
+
160
def test_instrument_sync_basic():
    """A bare @instrument on a plain function leaves its return value intact."""

    @instrument
    def test_func(x: int, y: int) -> int:
        return x + y

    assert test_func(2, 3) == 5
169
+
170
+
171
def test_instrument_sync_with_params():
    """@instrument with name= and category= works on a synchronous function."""

    @instrument(name="sync_custom", category="sync_type")
    def test_func(x: int) -> int:
        return x * 2

    assert test_func(5) == 10
180
+
181
+
182
def test_instrument_sync_with_exception():
    """An exception raised inside an instrumented sync function propagates to the caller."""

    @instrument
    def test_func():
        raise ValueError("Sync error")

    # The original exception type and message must both survive the wrapper.
    with pytest.raises(ValueError, match="Sync error"):
        test_func()
191
+
192
+
193
def test_instrument_sync_no_record_args():
    """record_args=False does not change what the instrumented sync function returns."""

    @instrument(record_args=False)
    def test_func(x: int) -> int:
        return x

    assert test_func(42) == 42
202
+
203
+
204
def test_instrument_sync_no_record_result():
    """record_result=False does not change what the instrumented sync function returns."""

    @instrument(record_result=False)
    def test_func() -> str:
        return "test"

    assert test_func() == "test"
213
+
214
+
215
def test_instrument_sync_with_category():
    """A custom category on a sync function leaves the return value intact."""

    @instrument(category="tool")
    def test_func() -> int:
        return 42

    assert test_func() == 42
224
+
225
+
226
def test_instrument_already_instrumented():
    """Applying instrument to an already-instrumented function returns that same object."""

    @instrument
    def test_func():
        return "original"

    # A second application must be a no-op: identity, not just equality.
    assert instrument(test_func) is test_func
238
+
239
+
240
def test_instrument_marks_as_instrumented():
    """The wrapper exposes the _hud_instrumented flag and the _hud_original attribute."""

    @instrument
    def test_func():
        return True

    for marker in ("_hud_instrumented", "_hud_original"):
        assert hasattr(test_func, marker)
    # The flag must be the literal True, not merely truthy.
    assert test_func._hud_instrumented is True
250
+
251
+
252
@pytest.mark.asyncio
async def test_instrument_async_complex_result():
    """A nested dict returned by an instrumented coroutine reaches the caller."""

    @instrument
    async def test_func() -> dict:
        return {"nested": {"data": [1, 2, 3]}, "count": 3}

    payload = await test_func()
    assert payload["count"] == 3
262
+
263
+
264
def test_instrument_sync_complex_result():
    """A dataclass instance returned by an instrumented sync function reaches the caller."""

    @dataclass
    class Result:
        value: int
        name: str

    @instrument
    def test_func() -> Result:
        return Result(value=42, name="test")

    assert test_func().value == 42
278
+
279
+
280
@pytest.mark.asyncio
async def test_instrument_async_with_self_param():
    """An instrumented async instance method can be called through an instance."""

    class TestClass:
        @instrument
        async def method(self, x: int) -> int:
            return x * 2

    assert await TestClass().method(5) == 10
292
+
293
+
294
def test_instrument_sync_with_cls_param():
    """An instrumented function wrapped in @classmethod can be called on the class."""

    class TestClass:
        @classmethod
        @instrument
        def method(cls, x: int) -> int:
            return x * 3

    assert TestClass.method(4) == 12
305
+
306
+
307
@pytest.mark.asyncio
async def test_instrument_async_serialization_error():
    """An argument whose attribute access always raises does not break the wrapped call."""

    class UnserializableArg:
        def __getattribute__(self, name):
            raise Exception("Can't serialize")

    @instrument
    async def test_func(arg):
        return "success"

    # The call must complete normally even though the argument cannot be inspected.
    assert await test_func(UnserializableArg()) == "success"
322
+
323
+
324
def test_instrument_function_without_signature():
    """instrument can wrap the built-in ``len`` and the wrapper returns the right value."""
    # Exercises the decorator on a built-in callable rather than a Python-level function.
    wrapped_len = instrument(len)
    assert wrapped_len([1, 2, 3]) == 3
330
+
331
+
332
@pytest.mark.asyncio
async def test_instrument_async_result_serialization_error():
    """A return value whose __iter__ raises is still handed back to the caller."""

    class UnserializableResult:
        def __iter__(self):
            raise Exception("Can't iterate")

    @instrument
    async def test_func():
        return UnserializableResult()

    # The call must complete normally and return the object itself.
    returned = await test_func()
    assert isinstance(returned, UnserializableResult)
347
+
348
+
349
def test_instrument_without_parentheses():
    """The bare ``@instrument`` form (no call) produces a working wrapper."""

    @instrument
    def test_func(x: int) -> int:
        return x + 1

    outcome = test_func(5)
    assert outcome == 6
357
+
358
+
359
def test_instrument_with_parentheses():
    """The called ``@instrument()`` form with no arguments produces a working wrapper."""

    @instrument()
    def test_func(x: int) -> int:
        return x + 1

    outcome = test_func(5)
    assert outcome == 6
367
+
368
+
369
@pytest.mark.asyncio
async def test_instrument_async_with_defaults():
    """Default argument values still apply through the instrumented wrapper."""

    @instrument
    async def test_func(x: int, y: int = 10) -> int:
        return x + y

    # First call relies on the default for y; second call overrides it.
    with_default = await test_func(5)
    assert with_default == 15
    overridden = await test_func(5, 20)
    assert overridden == 25
379
+
380
+
381
def test_instrument_sync_with_kwargs():
    """Extra keyword arguments are forwarded to an instrumented ``**kwargs`` function."""

    @instrument
    def test_func(x: int, **kwargs) -> dict:
        return {"x": x, **kwargs}

    assert test_func(1, a=2, b=3) == {"x": 1, "a": 2, "b": 3}
390
+
391
+
392
@pytest.mark.asyncio
async def test_instrument_async_with_varargs():
    """Positional varargs are forwarded to an instrumented ``*args`` coroutine."""

    @instrument
    async def test_func(*args) -> int:
        return sum(args)

    assert await test_func(1, 2, 3, 4) == 10
hud/tools/__init__.py CHANGED
@@ -4,6 +4,7 @@ from __future__ import annotations
4
4
 
5
5
  from typing import TYPE_CHECKING, Any
6
6
 
7
+ from .agent import AgentTool
7
8
  from .base import BaseHub, BaseTool
8
9
  from .bash import BashTool
9
10
  from .edit import EditTool
@@ -12,17 +13,26 @@ from .response import ResponseTool
12
13
  from .submit import SubmitTool
13
14
 
14
15
  if TYPE_CHECKING:
15
- from .computer import AnthropicComputerTool, HudComputerTool, OpenAIComputerTool
16
+ from .computer import (
17
+ AnthropicComputerTool,
18
+ GeminiComputerTool,
19
+ HudComputerTool,
20
+ OpenAIComputerTool,
21
+ QwenComputerTool,
22
+ )
16
23
 
17
24
  __all__ = [
25
+ "AgentTool",
18
26
  "AnthropicComputerTool",
19
27
  "BaseHub",
20
28
  "BaseTool",
21
29
  "BashTool",
22
30
  "EditTool",
31
+ "GeminiComputerTool",
23
32
  "HudComputerTool",
24
33
  "OpenAIComputerTool",
25
34
  "PlaywrightTool",
35
+ "QwenComputerTool",
26
36
  "ResponseTool",
27
37
  "SubmitTool",
28
38
  ]
@@ -30,7 +40,13 @@ __all__ = [
30
40
 
31
41
  def __getattr__(name: str) -> Any:
32
42
  """Lazy import computer tools to avoid importing pyautogui unless needed."""
33
- if name in ("AnthropicComputerTool", "HudComputerTool", "OpenAIComputerTool"):
43
+ if name in (
44
+ "AnthropicComputerTool",
45
+ "HudComputerTool",
46
+ "OpenAIComputerTool",
47
+ "GeminiComputerTool",
48
+ "QwenComputerTool",
49
+ ):
34
50
  from . import computer
35
51
 
36
52
  return getattr(computer, name)
hud/tools/agent.py ADDED
@@ -0,0 +1,223 @@
1
+ """AgentTool - run a Task with an agent as a tool."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import inspect
6
+ from typing import TYPE_CHECKING, Any, Union, get_args, get_origin
7
+
8
+ from fastmcp.tools.tool import FunctionTool, ToolResult
9
+ from mcp.types import TextContent
10
+
11
+ from hud.tools.base import BaseTool
12
+
13
+ if TYPE_CHECKING:
14
+ from hud.agents.base import MCPAgent
15
+ from hud.eval.task import Task
16
+
17
+ __all__ = ["AgentTool"]
18
+
19
+
20
def _is_eval_only(param: inspect.Parameter) -> bool:
    """Check if param is eval-only: has None default AND None in type union.

    Handles both runtime types and string annotations (PEP 563).

    Args:
        param: A parameter taken from an ``inspect.Signature``.

    Returns:
        True when the parameter defaults to ``None`` and its annotation is an
        optional/union type that includes ``None``; False otherwise, including
        when the parameter has no annotation or no default.
    """
    # Must have default of None. ``Parameter.empty`` (no default at all) is a
    # distinct sentinel, so required parameters are rejected here too.
    if param.default is not None:
        return False

    annotation = param.annotation
    if annotation is inspect.Parameter.empty:
        return False

    # String annotations (from ``from __future__ import annotations`` or quoted).
    if isinstance(annotation, str):
        # Compare against a whitespace-free copy so that "str|None",
        # "str | None" and "str |  None" are all recognised alike.
        compact = "".join(annotation.split())
        return (
            "|None" in compact
            or "None|" in compact
            or "Optional[" in compact
            or ("Union[" in compact and "None" in compact)
        )

    # Runtime annotations: typing.Union[X, None] / typing.Optional[X].
    if get_origin(annotation) is Union:
        return type(None) in get_args(annotation)

    # Runtime ``X | None`` objects; ``types.UnionType`` is absent on
    # interpreters older than 3.10, hence the getattr guard.
    import types

    union_type = getattr(types, "UnionType", None)
    if union_type is not None and isinstance(annotation, union_type):
        return type(None) in get_args(annotation)

    return False
60
+
61
+
62
class AgentTool(BaseTool):
    """Tool that runs a Task template with an agent.

    Parameters with `| None = None` are eval-only and hidden from the tool schema.
    Variadic parameters (``*args`` / ``**kwargs``) are not exposed either, because
    they cannot be described as a single named schema property.

    Example:
        ```python
        @env.scenario()
        async def investigate(
            issue_id: str,  # Required - orchestrator sees
            expected_cause: str | None = None,  # Eval only - hidden
        ):
            yield {"task": f"Investigate {issue_id}"}


        seer = AgentTool(env("investigate"), model="ft:seer-v2")
        ```
    """

    def __init__(
        self,
        task: Task,
        *,
        model: str | None = None,
        agent: type[MCPAgent] | None = None,
        agent_params: dict[str, Any] | None = None,
        name: str | None = None,
        description: str | None = None,
        trace: bool = False,
    ) -> None:
        """Configure the tool from a task template.

        Args:
            task: Task template; its ``args`` are merged with call-time arguments.
            model: Model identifier passed to ``create_agent``. Mutually
                exclusive with ``agent``.
            agent: Agent class whose ``create`` classmethod builds the agent.
                Mutually exclusive with ``model``.
            agent_params: Keyword arguments forwarded when the agent is built.
            name: Tool name; defaults to the task's scenario name, then
                ``"agent_tool"``.
            description: Tool description; defaults to ``"Run scenario: ..."``.
            trace: Request tracing for the run; ignored when a parent trace
                is active at call time.

        Raises:
            ValueError: If neither or both of ``model`` and ``agent`` are given.
        """
        if not model and agent is None:
            raise ValueError("Must provide either 'model' or 'agent'")
        if model and agent is not None:
            raise ValueError("Cannot provide both 'model' and 'agent'")

        self._task = task
        self._model = model
        self._agent_cls = agent
        self._agent_params = agent_params or {}
        self._trace = trace

        # Defaults used when the scenario function cannot be located.
        self._visible_params: set[str] = set()
        self._param_schema: dict[str, Any] = {
            "type": "object",
            "properties": {},
            "required": [],
        }

        if task.env and task.scenario:
            scenario_fn = task.env._scenarios.get(task.scenario)
            if scenario_fn:
                sig = inspect.signature(scenario_fn)
                variadic = (
                    inspect.Parameter.VAR_POSITIONAL,
                    inspect.Parameter.VAR_KEYWORD,
                )
                # Hide eval-only parameters, and skip *args/**kwargs, which
                # would otherwise show up as bogus required string properties.
                visible = {
                    param_name: param
                    for param_name, param in sig.parameters.items()
                    if param.kind not in variadic and not _is_eval_only(param)
                }
                self._visible_params = set(visible)
                # String annotations must be resolved against the namespace
                # they were written in, i.e. the scenario function's module.
                scenario_globals = getattr(inspect.unwrap(scenario_fn), "__globals__", None)
                self._param_schema = self._build_schema(visible, scenario_globals)

        tool_name = name or task.scenario or "agent_tool"
        tool_desc = description or f"Run scenario: {task.scenario}"

        super().__init__(name=tool_name, description=tool_desc)

    @staticmethod
    def _resolve_annotation(annotation: str, globalns: dict[str, Any] | None) -> Any:
        """Evaluate a string annotation, returning None when it cannot be resolved.

        The scenario's own namespace is tried first so names imported only there
        resolve correctly; this module's namespace is the fallback.
        """
        namespaces: list[dict[str, Any]] = []
        if globalns is not None:
            namespaces.append(globalns)
        namespaces.append(globals())

        for namespace in namespaces:
            try:
                # NOTE(review): eval runs on annotation text taken from the
                # scenario function's signature, not on tool-call input.
                return eval(annotation, namespace)  # noqa: S307
            except Exception:  # noqa: S112 - best effort, try the next namespace
                continue
        return None

    def _build_schema(
        self,
        params: dict[str, inspect.Parameter],
        globalns: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Build JSON schema using Pydantic TypeAdapter.

        Args:
            params: Parameters to expose, keyed by name.
            globalns: Namespace used to resolve string annotations; when
                omitted only this module's namespace is used.

        Returns:
            An object schema with ``properties`` and ``required``. Parameters
            whose annotation is missing or unresolvable are typed as string.
        """
        from pydantic import TypeAdapter

        properties: dict[str, Any] = {}
        required: list[str] = []

        for param_name, param in params.items():
            # Fallback used for missing, unresolvable or unsupported annotations.
            schema: dict[str, Any] = {"type": "string"}

            annotation = param.annotation
            if annotation is not inspect.Parameter.empty:
                if isinstance(annotation, str):
                    annotation = self._resolve_annotation(annotation, globalns)
                if annotation is not None:
                    try:
                        schema = TypeAdapter(annotation).json_schema()
                    except Exception:
                        # Schema generation is best effort; keep the tool usable.
                        schema = {"type": "string"}
            properties[param_name] = schema

            if param.default is inspect.Parameter.empty:
                required.append(param_name)
            elif param.default is not None:
                properties[param_name]["default"] = param.default

        return {"type": "object", "properties": properties, "required": required}

    @property
    def mcp(self) -> FunctionTool:
        """Get as FastMCP FunctionTool with filtered schema.

        The tool is built on first access and cached on the instance.
        """
        if not hasattr(self, "_mcp_tool"):
            # Directly instantiate FunctionTool with our callable and schema
            # This bypasses from_function's signature parsing
            self._mcp_tool = FunctionTool(
                name=self.name,
                description=self.description or "",
                parameters=self._param_schema,
                fn=self._execute_with_args,
            )
        return self._mcp_tool

    async def _execute_with_args(self, **kwargs: Any) -> ToolResult:
        """Internal executor that FastMCP calls with parsed arguments."""
        return await self(**kwargs)

    async def __call__(self, **kwargs: Any) -> ToolResult:
        """Execute the task with a fresh agent.

        Args:
            **kwargs: Call-time arguments; anything that is not a visible
                scenario parameter is dropped.

        Returns:
            A ToolResult holding one text item with the agent result's
            ``content``, or an empty string when there is none.
        """
        from hud.eval.context import get_current_trace_id
        from hud.eval.manager import run_eval
        from hud.telemetry.instrument import instrument

        # Filter to visible params only
        filtered = {k: v for k, v in kwargs.items() if k in self._visible_params}

        # Merge with template args; call-time values win on key collisions.
        base_args = self._task.args or {}
        task = self._task.model_copy(update={"args": {**base_args, **filtered}})

        # Use parent trace if available (for hierarchical agents)
        parent_trace_id = get_current_trace_id()

        # If nested (has parent), skip subagent's enter/exit registration
        # Tool calls are still recorded via the shared trace_id's context
        is_nested = parent_trace_id is not None

        # Trace if explicitly requested AND not nested (nested uses parent trace)
        should_trace = self._trace and not is_nested

        # Wrap execution with instrumentation to mark as subagent
        # Platform uses category="subagent" to detect and render subagent tool calls
        @instrument(category="subagent", name=self.name)
        async def _run_subagent() -> ToolResult:
            async with run_eval(
                task,
                trace=should_trace,
                trace_id=parent_trace_id,
                quiet=True,
            ) as ctx:
                if self._model:
                    from hud.agents import create_agent

                    agent = create_agent(self._model, **self._agent_params)
                else:
                    agent = self._agent_cls.create(**self._agent_params)  # type: ignore

                result = await agent.run(ctx)
                content = result.content if hasattr(result, "content") and result.content else ""
                return ToolResult(content=[TextContent(type="text", text=content)])

        return await _run_subagent()