hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,193 @@
1
+ from __future__ import annotations
2
+
3
+ import pytest
4
+ from mcp.types import ImageContent, TextContent
5
+
6
+ from hud.tools.types import ContentResult, EvaluationResult, ToolError
7
+
8
+
9
def test_evaluation_result_defaults():
    """A no-argument EvaluationResult exposes the expected default field values."""
    fresh = EvaluationResult()

    assert fresh.reward == 0.0
    assert fresh.done is False
    assert fresh.content is None
    assert fresh.info == {}
    assert fresh.isError is False
18
+
19
+
20
def test_evaluation_result_with_values():
    """Explicitly supplied EvaluationResult fields are stored unchanged."""
    supplied = {
        "reward": 0.95,
        "done": True,
        "content": "Task completed successfully",
        "info": {"steps": 5},
        "isError": False,
    }
    populated = EvaluationResult(**supplied)

    assert populated.reward == 0.95
    assert populated.done is True
    assert populated.content == "Task completed successfully"
    assert populated.info == {"steps": 5}
    assert populated.isError is False
35
+
36
+
37
def test_content_result_defaults():
    """A no-argument ContentResult leaves every checked field as None."""
    blank = ContentResult()

    assert blank.output is None
    assert blank.error is None
    assert blank.base64_image is None
    assert blank.system is None
45
+
46
+
47
def test_content_result_with_values():
    """Explicitly supplied ContentResult fields are stored unchanged."""
    supplied = {
        "output": "Command executed",
        "error": "No errors",
        "base64_image": "base64data",
        "system": "System message",
    }
    populated = ContentResult(**supplied)

    assert populated.output == "Command executed"
    assert populated.error == "No errors"
    assert populated.base64_image == "base64data"
    assert populated.system == "System message"
60
+
61
+
62
def test_content_result_add_both_output():
    """Adding two results that both carry output concatenates the output text."""
    left = ContentResult(output="Part 1")
    right = ContentResult(output=" Part 2")

    merged = left + right

    assert merged.output == "Part 1 Part 2"
    # Fields neither operand set must stay unset after the merge.
    assert merged.error is None
    assert merged.base64_image is None
72
+
73
+
74
def test_content_result_add_both_error():
    """Adding two results that both carry errors concatenates the error text."""
    left = ContentResult(error="Error 1")
    right = ContentResult(error=" Error 2")

    merged = left + right

    assert merged.error == "Error 1 Error 2"
    assert merged.output is None
83
+
84
+
85
def test_content_result_add_both_system():
    """Adding two results that both carry system messages concatenates them."""
    left = ContentResult(system="System 1")
    right = ContentResult(system=" System 2")

    merged = left + right

    assert merged.system == "System 1 System 2"
93
+
94
+
95
def test_content_result_add_one_sided():
    """A field set on only one operand is carried into the sum as-is."""
    output_only = ContentResult(output="Output")
    error_only = ContentResult(error="Error")

    merged = output_only + error_only

    assert merged.output == "Output"
    assert merged.error == "Error"
104
+
105
+
106
def test_content_result_add_images_raises_error():
    """Adding two results that both carry an image raises ValueError."""
    first_image = ContentResult(base64_image="image1")
    second_image = ContentResult(base64_image="image2")

    with pytest.raises(ValueError, match="Cannot combine tool results"):
        _ = first_image + second_image
113
+
114
+
115
def test_content_result_add_one_image():
    """An image present on only one operand survives the addition."""
    with_image = ContentResult(base64_image="image1")
    with_output = ContentResult(output="Output")

    merged = with_image + with_output

    assert merged.base64_image == "image1"
    assert merged.output == "Output"
124
+
125
+
126
def test_content_result_to_content_blocks_output():
    """An output-only result converts to a single TextContent block."""
    content_blocks = ContentResult(output="Test output").to_content_blocks()

    assert len(content_blocks) == 1
    only_block = content_blocks[0]
    assert isinstance(only_block, TextContent)
    assert only_block.text == "Test output"
135
+
136
+
137
def test_content_result_to_content_blocks_error():
    """An error-only result converts to a single TextContent block."""
    content_blocks = ContentResult(error="Test error").to_content_blocks()

    assert len(content_blocks) == 1
    only_block = content_blocks[0]
    assert isinstance(only_block, TextContent)
    assert only_block.text == "Test error"
146
+
147
+
148
def test_content_result_to_content_blocks_image():
    """An image-only result converts to a single PNG ImageContent block."""
    content_blocks = ContentResult(base64_image="base64data").to_content_blocks()

    assert len(content_blocks) == 1
    only_block = content_blocks[0]
    assert isinstance(only_block, ImageContent)
    assert only_block.data == "base64data"
    assert only_block.mimeType == "image/png"
158
+
159
+
160
def test_content_result_to_content_blocks_all():
    """Output, error and image convert to three blocks in that order."""
    full_result = ContentResult(output="Output", error="Error", base64_image="image")

    content_blocks = full_result.to_content_blocks()

    assert len(content_blocks) == 3
    # Safe to unpack: the length was asserted on the line above.
    output_block, error_block, image_block = content_blocks
    assert isinstance(output_block, TextContent)
    assert output_block.text == "Output"
    assert isinstance(error_block, TextContent)
    assert error_block.text == "Error"
    assert isinstance(image_block, ImageContent)
    assert image_block.data == "image"
177
+
178
+
179
def test_content_result_to_content_blocks_empty():
    """A result with no fields set converts to an empty block list."""
    content_blocks = ContentResult().to_content_blocks()

    assert len(content_blocks) == 0
186
+
187
+
188
def test_tool_error():
    """ToolError is an Exception whose string form is the given message."""
    raised = ToolError("Test error message")

    assert isinstance(raised, Exception)
    assert str(raised) == "Test error message"
hud/tools/types.py CHANGED
@@ -6,6 +6,18 @@ from mcp.types import ContentBlock, ImageContent, TextContent
6
6
  from pydantic import BaseModel, ConfigDict, Field
7
7
 
8
8
 
9
class Coordinate(BaseModel):
    """An integer (x, y) point.

    Intended for path-based actions such as drag operations.
    """

    # Unknown keys are rejected rather than silently ignored.
    model_config = ConfigDict(extra="forbid")

    # Both axes are required; `...` marks the field as having no default.
    x: int = Field(..., description="X coordinate")
    y: int = Field(..., description="Y coordinate")
19
+
20
+
9
21
  class EvaluationResult(BaseModel):
10
22
  """Standard evaluation result format."""
11
23
 
@@ -28,6 +40,7 @@ class ContentResult(BaseModel):
28
40
  error: str | None = Field(default=None, description="Error message")
29
41
  base64_image: str | None = Field(default=None, description="Base64-encoded image")
30
42
  system: str | None = Field(default=None, description="System message")
43
+ url: str | None = Field(default=None, description="Current page URL (for browser automation)")
31
44
 
32
45
  def __add__(self, other: ContentResult) -> ContentResult:
33
46
  def combine_fields(
@@ -44,6 +57,7 @@ class ContentResult(BaseModel):
44
57
  error=combine_fields(self.error, other.error),
45
58
  base64_image=combine_fields(self.base64_image, other.base64_image, False),
46
59
  system=combine_fields(self.system, other.system),
60
+ url=combine_fields(self.url, other.url, False),
47
61
  )
48
62
 
49
63
  def to_content_blocks(self) -> list[ContentBlock]:
@@ -55,7 +69,7 @@ class ContentResult(BaseModel):
55
69
  result: ContentResult to convert
56
70
 
57
71
  Returns:
58
- List of ContentBlock
72
+ List of ContentBlock with URL embedded as metadata if available
59
73
  """
60
74
  blocks: list[ContentBlock] = []
61
75
 
@@ -65,6 +79,12 @@ class ContentResult(BaseModel):
65
79
  blocks.append(TextContent(text=self.error, type="text"))
66
80
  if self.base64_image:
67
81
  blocks.append(ImageContent(data=self.base64_image, mimeType="image/png", type="image"))
82
+
83
+ # Add URL as a special metadata text block (for Gemini Computer Use)
84
+ # Always include URL if set, even if it's a placeholder like "about:blank"
85
+ if self.url:
86
+ blocks.append(TextContent(text=f"__URL__:{self.url}", type="text"))
87
+
68
88
  return blocks
69
89
 
70
90
 
hud/types.py CHANGED
@@ -1,34 +1,116 @@
1
1
  from __future__ import annotations
2
2
 
3
- import contextlib
4
3
  import json
5
4
  import logging
6
5
  import uuid
7
- from collections import defaultdict
8
- from string import Template
6
+ from enum import Enum
9
7
  from typing import Any, Literal
10
8
 
11
9
  import mcp.types as types
12
10
  from mcp.types import CallToolRequestParams, CallToolResult
13
- from pydantic import BaseModel, ConfigDict, Field, field_validator
11
+ from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_validator
14
12
 
15
13
  from hud.settings import settings
14
+ from hud.utils.env import resolve_env_vars as _resolve_env_vars
16
15
  from hud.utils.tool_shorthand import normalize_to_tool_call_dict
17
16
 
18
17
  logger = logging.getLogger(__name__)
19
18
 
19
+ # Guard to ensure we only log missing HUD_API_KEY once
20
+ _missing_api_key_error_logged: bool = False
20
21
 
21
- class Task(BaseModel):
22
+
23
class AgentType(str, Enum):
    """String-valued enumeration of the agent implementations selectable by name."""

    CLAUDE = "claude"
    OPENAI = "openai"
    OPERATOR = "operator"
    GEMINI = "gemini"
    GEMINI_CUA = "gemini_cua"
    OPENAI_COMPATIBLE = "openai_compatible"
    INTEGRATION_TEST = "integration_test"

    @property
    def cls(self) -> type:
        """Return the agent class associated with this member.

        The agent modules are imported inside the property, on access,
        rather than at module import time.

        Raises:
            ValueError: If the member has no associated agent class.
        """
        from hud.agents import OpenAIAgent, OperatorAgent
        from hud.agents.claude import ClaudeAgent
        from hud.agents.gemini import GeminiAgent
        from hud.agents.gemini_cua import GeminiCUAAgent
        from hud.agents.openai_chat import OpenAIChatAgent

        registry: dict[AgentType, type] = {
            AgentType.CLAUDE: ClaudeAgent,
            AgentType.OPENAI: OpenAIAgent,
            AgentType.OPERATOR: OperatorAgent,
            AgentType.GEMINI: GeminiAgent,
            AgentType.GEMINI_CUA: GeminiCUAAgent,
            AgentType.OPENAI_COMPATIBLE: OpenAIChatAgent,
        }

        # The integration-test runner is imported only when it is requested.
        if self is AgentType.INTEGRATION_TEST:
            from hud.agents.misc.integration_test_agent import IntegrationTestRunner

            return IntegrationTestRunner

        agent_cls = registry.get(self)
        if agent_cls is None:
            raise ValueError(f"Unsupported agent type: {self}")
        return agent_cls
55
+
56
+
57
class BaseAgentConfig(BaseModel):
    """LLM-specific agent settings.

    The fields ``allowed_tools``, ``disallowed_tools``, ``append_setup_output``
    and ``initial_screenshot`` remain only so that v4 task configs keep
    validating; the agent no longer applies them. Configure those concerns on
    the Environment/Task instead.
    """

    # Unknown keys are rejected; fields may be populated by name or by alias.
    model_config = ConfigDict(arbitrary_types_allowed=True, extra="forbid", populate_by_name=True)

    # Model identifier: accepted on input as 'model' (preferred) or 'checkpoint_name'.
    model: str | None = Field(
        default=None,
        validation_alias=AliasChoices("model", "checkpoint_name"),
    )
    # Human-readable display name.
    model_name: str = "Agent"

    # LLM-specific setting.
    system_prompt: str | None = None

    # Deprecated: accepted for v4 task-config compatibility, not applied by the agent.
    allowed_tools: list[str] | None = None
    disallowed_tools: list[str] | None = None
    append_setup_output: bool = True
    # Older spelling of append_setup_output, declared as its own field (backwards compat).
    append_setup_tool: bool = True
    initial_screenshot: bool = True

    @property
    def checkpoint_name(self) -> str | None:
        """Read-only alias of ``model``, kept for backwards compatibility."""
        return self.model
87
+
88
+
89
+ class LegacyTask(BaseModel):
90
+ """
91
+ DEPRECATED: Use Task from env() instead.
92
+
23
93
  A task configuration that can be used to create a task.
24
94
 
25
95
  The mcp_config field supports environment variable substitution using
26
96
  template placeholders in the format ${VAR_NAME} or ${VAR_NAME:default_value}.
27
97
 
28
- Example:
98
+ .. deprecated:: 0.5.0
99
+ LegacyTask is deprecated in v0.5.0 and will be removed in v0.6.0
100
+ (no earlier than March 1st, 2026).
101
+
102
+ Use one of these migration paths:
103
+
104
+ 1. Quick conversion: ``Task.from_v4(legacy_task)`` converts LegacyTask to Task
105
+ 2. Full migration: Use ``@env.scenario()`` with setup code before first yield
106
+ and evaluate code after first yield
107
+
108
+ See https://docs.hud.ai/migration for the full migration guide.
109
+
110
+ Example (deprecated):
29
111
  mcp_config: {
30
112
  "hud": {
31
- "url": "${HUD_MCP_URL:https://mcp.hud.so/v3/mcp}",
113
+ "url": "${HUD_MCP_URL:https://mcp.hud.ai/v3/mcp}",
32
114
  "headers": {
33
115
  "Authorization": "Bearer ${HUD_API_KEY}",
34
116
  "Mcp-Image": "your-mcp-image"
@@ -43,10 +125,23 @@ class Task(BaseModel):
43
125
  setup_tool: MCPToolCall | list[MCPToolCall] | None = None
44
126
  evaluate_tool: MCPToolCall | list[MCPToolCall] | None = None
45
127
  integration_test_tool: MCPToolCall | list[MCPToolCall] | None = None
46
- agent_tools: list[str] | None = None
47
- system_prompt: str | None = None
128
+ agent_config: BaseAgentConfig | None = None
48
129
  metadata: dict[str, Any] = Field(default_factory=dict)
49
130
 
131
+ def __init__(self, **data: Any) -> None:
132
+ """Initialize LegacyTask with deprecation warning."""
133
+ import warnings
134
+
135
+ warnings.warn(
136
+ "LegacyTask is deprecated in v0.5.0 and will be removed in v0.6.0 "
137
+ "(no earlier than March 1st, 2026). "
138
+ "Use Task.from_v4() for quick conversion, or migrate to @env.scenario(). "
139
+ "See https://docs.hud.ai/migration for details.",
140
+ DeprecationWarning,
141
+ stacklevel=2,
142
+ )
143
+ super().__init__(**data)
144
+
50
145
  @field_validator("mcp_config", "metadata", mode="before")
51
146
  @classmethod
52
147
  def parse_json_strings(cls, v: Any) -> Any:
@@ -60,6 +155,25 @@ class Task(BaseModel):
60
155
  raise HudConfigError(f"Invalid JSON string: {e}") from e
61
156
  return v
62
157
 
158
+ @field_validator("agent_config", mode="before")
159
+ @classmethod
160
+ def parse_agent_config(cls, v: Any) -> BaseAgentConfig | None:
161
+ """Parse agent_config into BaseAgentConfig."""
162
+ if v is None:
163
+ return None
164
+ if isinstance(v, BaseAgentConfig):
165
+ return v
166
+ if isinstance(v, str):
167
+ try:
168
+ v = json.loads(v)
169
+ except json.JSONDecodeError as e:
170
+ from hud.shared.exceptions import HudConfigError
171
+
172
+ raise HudConfigError(f"Invalid JSON string for agent_config: {e}") from e
173
+ if isinstance(v, dict):
174
+ return BaseAgentConfig(**v)
175
+ return v
176
+
63
177
  @field_validator("setup_tool", "evaluate_tool", "integration_test_tool", mode="before")
64
178
  @classmethod
65
179
  def convert_dict_to_tool_call(cls, v: Any, info: Any) -> Any:
@@ -98,44 +212,21 @@ class Task(BaseModel):
98
212
  @classmethod
99
213
  def resolve_env_vars(cls, v: dict[str, Any]) -> dict[str, Any]:
100
214
  """
101
- Automatically resolve environment variables in mcp_config using Template.
215
+ Automatically resolve environment variables in mcp_config.
102
216
 
103
217
  Supports ${VAR_NAME} syntax with variable substitution from
104
- System environment variables (including HUD_API_KEY, etc.)
218
+ system environment variables and settings (including HUD_API_KEY, etc.)
105
219
 
106
220
  Missing variables resolve to empty strings.
107
221
  """
108
- import os
109
-
110
- # Start with current environment variables
111
- mapping = dict(os.environ)
112
- # Include settings (from process env, project .env, and user .env)
113
- settings_dict = settings.model_dump()
114
- mapping.update(settings_dict)
115
- # Add UPPERCASE aliases for settings keys
116
- for _key, _val in settings_dict.items():
117
- with contextlib.suppress(Exception):
118
- mapping[_key.upper()] = _val
119
-
120
- if settings.api_key:
121
- mapping["HUD_API_KEY"] = settings.api_key
122
- else:
123
- logger.error("HUD_API_KEY is not set, tracing and remote training will not work")
124
-
125
- def substitute_in_value(obj: Any) -> Any:
126
- """Recursively substitute variables in nested structures."""
127
- if isinstance(obj, str):
128
- # Use Template's substitute with defaultdict - missing vars become empty strings
129
- safe_mapping = defaultdict(str, mapping)
130
- return Template(obj).substitute(safe_mapping)
131
- elif isinstance(obj, dict):
132
- return {k: substitute_in_value(v) for k, v in obj.items()}
133
- elif isinstance(obj, list):
134
- return [substitute_in_value(item) for item in obj]
135
- else:
136
- return obj
222
+ # Warn once if HUD_API_KEY is not set
223
+ if not settings.api_key:
224
+ global _missing_api_key_error_logged
225
+ if not _missing_api_key_error_logged:
226
+ logger.error("HUD_API_KEY is not set, tracing and remote training will not work")
227
+ _missing_api_key_error_logged = True
137
228
 
138
- return substitute_in_value(v)
229
+ return _resolve_env_vars(v)
139
230
 
140
231
 
141
232
  class MCPToolCall(CallToolRequestParams):
@@ -164,7 +255,9 @@ class MCPToolCall(CallToolRequestParams):
164
255
 
165
256
 
166
257
  class MCPToolResult(CallToolResult):
167
- """A tool result."""
258
+ """A tool result with optional call_id for correlation."""
259
+
260
+ call_id: str | None = None # For correlating with provider-specific tool call IDs
168
261
 
169
262
  def _get_content_summary(self) -> str:
170
263
  """Extract a summary of the content."""
@@ -216,7 +309,7 @@ class AgentResponse(BaseModel):
216
309
  tool_calls: list[MCPToolCall] = Field(default_factory=list)
217
310
  done: bool = Field(default=False)
218
311
 
219
- # --- TELEMETRY [hud.so] ---
312
+ # --- TELEMETRY [hud.ai] ---
220
313
  # Responses
221
314
  content: str | None = Field(default=None)
222
315
  reasoning: str | None = Field(default=None)
@@ -267,6 +360,27 @@ class TraceStep(BaseModel):
267
360
  model_config = ConfigDict(populate_by_name=True, extra="allow")
268
361
 
269
362
 
363
class HudSpan(BaseModel):
    """One telemetry span in the shape exported to the HUD API."""

    # Unknown keys are rejected rather than silently ignored.
    model_config = ConfigDict(extra="forbid")

    name: str

    # Identifiers are fixed-length hex strings: 32 digits for the trace,
    # 16 for the span and (when present) its parent.
    trace_id: str = Field(pattern=r"^[0-9a-fA-F]{32}$")
    span_id: str = Field(pattern=r"^[0-9a-fA-F]{16}$")
    parent_span_id: str | None = Field(default=None, pattern=r"^[0-9a-fA-F]{16}$")

    # Timestamps are carried as ISO-format strings.
    start_time: str
    end_time: str

    # One of "UNSET", "OK", "ERROR".
    status_code: str
    status_message: str | None = None

    attributes: TraceStep
    exceptions: list[dict[str, Any]] | None = None
    internal_type: str | None = None
382
+
383
+
270
384
  class Trace(BaseModel):
271
385
  """Unified result from agent execution (task or prompt).
272
386
 
@@ -286,7 +400,7 @@ class Trace(BaseModel):
286
400
  isError: bool = Field(default=False)
287
401
 
288
402
  # Metadata
289
- task: Task | None = Field(default=None)
403
+ task: LegacyTask | None = Field(default=None)
290
404
 
291
405
  # Trace
292
406
  trace: list[TraceStep] = Field(default_factory=list)
@@ -302,26 +416,22 @@ class Trace(BaseModel):
302
416
  def append(self, step: TraceStep) -> None:
303
417
  self.trace.append(step)
304
418
 
305
- def populate_from_context(self) -> None:
306
- """Populate trace steps from the current trace context if available.
307
-
308
- This checks if we're executing within a hud.trace() context and
309
- automatically populates the trace field with collected steps.
310
- """
311
- from hud.otel.context import get_current_task_run_id
312
- from hud.telemetry.replay import get_trace
313
419
 
314
- task_run_id = get_current_task_run_id()
315
- if task_run_id:
316
- collected_trace = get_trace(task_run_id)
317
- if collected_trace:
318
- self.trace = collected_trace.trace
420
+ # Re-export Task for backwards compatibility (after module defs to avoid circular import)
421
+ from hud.eval.task import Task # noqa: E402
319
422
 
423
+ # Type alias for functions that accept v5 Task, v4 LegacyTask, or raw dicts
424
+ TaskInput = Task | LegacyTask | dict[str, Any]
320
425
 
321
426
  __all__ = [
322
427
  "AgentResponse",
428
+ "AgentType",
429
+ "HudSpan",
430
+ "LegacyTask",
323
431
  "MCPToolCall",
324
432
  "MCPToolResult",
433
+ "Task",
434
+ "TaskInput",
325
435
  "Trace",
326
436
  "TraceStep",
327
437
  ]
hud/utils/__init__.py CHANGED
@@ -2,9 +2,11 @@ from __future__ import annotations
2
2
 
3
3
  from .hud_console import HUDConsole, hud_console
4
4
  from .telemetry import stream
5
+ from .types import with_signature
5
6
 
6
7
  __all__ = [
7
8
  "HUDConsole",
8
9
  "hud_console",
9
10
  "stream",
11
+ "with_signature",
10
12
  ]
hud/utils/env.py ADDED
@@ -0,0 +1,67 @@
1
+ """Environment variable resolution utilities."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import contextlib
6
+ import os
7
+ from collections import defaultdict
8
+ from string import Template
9
+ from typing import TYPE_CHECKING, Any
10
+
11
+ from hud.settings import settings
12
+
13
+ if TYPE_CHECKING:
14
+ from collections.abc import Mapping
15
+
16
+
17
def resolve_env_vars(obj: Any, extra_mapping: Mapping[str, Any] | None = None) -> Any:
    """Recursively resolve ``${VAR_NAME}`` placeholders in strings.

    Substitution uses :class:`string.Template`. Values are gathered in this
    order, with later sources overriding earlier ones:

    1. ``os.environ``
    2. ``hud.settings`` (its dumped fields, plus an UPPERCASE alias of each
       field name, so both ``${api_key}`` and ``${API_KEY}`` work)
    3. ``HUD_API_KEY``, set from ``settings.api_key`` when that is truthy
    4. The optional ``extra_mapping``

    Unknown variables resolve to empty strings. A ``$`` that does not start a
    valid placeholder (for example a trailing ``$``) is left untouched instead
    of raising ``ValueError``; ``$$`` still collapses to a single ``$``.

    Args:
        obj: The object to resolve. Strings are substituted; dicts and lists
            are rebuilt with their values/items resolved recursively (dict
            keys are not substituted); anything else is returned unchanged.
        extra_mapping: Optional additional key-value pairs to include.

    Returns:
        A copy of ``obj`` with all ``${VAR_NAME}`` placeholders resolved.

    Example:
        With ``MY_VAR=hello`` in the environment,
        ``resolve_env_vars({"key": "${MY_VAR}"})`` returns ``{"key": "hello"}``.
    """
    # Build mapping from environment and settings
    mapping: dict[str, Any] = dict(os.environ)
    settings_dict = settings.model_dump()
    mapping.update(settings_dict)

    # Add UPPERCASE aliases for settings keys
    for key, val in settings_dict.items():
        with contextlib.suppress(Exception):
            mapping[key.upper()] = val

    if settings.api_key:
        mapping["HUD_API_KEY"] = settings.api_key

    if extra_mapping:
        mapping.update(extra_mapping)

    # Built once for the whole traversal: lookups of unknown names yield "".
    lenient_mapping = defaultdict(str, mapping)

    def substitute(value: Any) -> Any:
        if isinstance(value, str):
            # safe_substitute tolerates stray "$" characters in config values.
            return Template(value).safe_substitute(lenient_mapping)
        if isinstance(value, dict):
            return {k: substitute(v) for k, v in value.items()}
        if isinstance(value, list):
            return [substitute(item) for item in value]
        return value

    return substitute(obj)