hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/environment/utils/formats.py ADDED
@@ -0,0 +1,215 @@
+"""Tool format parsing and conversion for OpenAI, Claude, Gemini, and MCP."""
+
+from __future__ import annotations
+
+import json
+from enum import Enum, auto
+from typing import Any
+
+from hud.types import MCPToolCall, MCPToolResult
+
+__all__ = [
+    "ToolFormat",
+    "format_result",
+    "parse_tool_call",
+    "parse_tool_calls",
+    "result_to_string",
+]
+
+
+class ToolFormat(Enum):
+    """Detected tool call format."""
+
+    OPENAI = auto()  # function.arguments as JSON string
+    CLAUDE = auto()  # type="tool_use", input as dict
+    GEMINI = auto()  # functionCall with args
+    MCP = auto()  # name + arguments
+
+
+# -----------------------------------------------------------------------------
+# Parsing
+# -----------------------------------------------------------------------------
+
+
+def _to_dict(obj: Any) -> dict[str, Any]:
+    """Convert object to dict for uniform processing."""
+    if isinstance(obj, dict):
+        return obj
+    if hasattr(obj, "model_dump"):
+        return obj.model_dump()
+    if hasattr(obj, "__dict__"):
+        return vars(obj)
+    raise ValueError(f"Cannot convert {type(obj).__name__} to dict")
+
+
+def _parse_json_args(args: Any) -> dict[str, Any]:
+    """Parse arguments, handling JSON strings."""
+    if not args:
+        return {}
+    if isinstance(args, str):
+        try:
+            return json.loads(args)
+        except json.JSONDecodeError:
+            return {}
+    return args
+
+
+def parse_tool_call(call: Any, **kwargs: Any) -> tuple[MCPToolCall, ToolFormat]:
+    """Parse any tool call format into (MCPToolCall, ToolFormat).
+
+    Supports:
+    - String (tool name only, or with kwargs)
+    - Tuple: (name,), (name, args), (name, args, id)
+    - MCPToolCall
+    - OpenAI: {function: {name, arguments}, id}
+    - Claude: {type: "tool_use", name, input, id}
+    - Gemini: {functionCall: {name, args}} or {name, args}
+    - Generic: {name, arguments}
+
+    Args:
+        call: Tool call in any supported format.
+        **kwargs: Additional arguments (merged when call is a string).
+
+    Returns:
+        Tuple of (MCPToolCall, ToolFormat) for the parsed call.
+
+    Raises:
+        ValueError: If format is unrecognized.
+    """
+    # Primitives
+    if isinstance(call, str):
+        return MCPToolCall(name=call, arguments=kwargs or {}), ToolFormat.MCP
+
+    if isinstance(call, tuple):
+        tc = MCPToolCall(name=call[0], arguments=call[1] if len(call) > 1 else {})
+        if len(call) > 2:
+            tc.id = call[2]
+        return tc, ToolFormat.MCP
+
+    if isinstance(call, MCPToolCall):
+        return call, ToolFormat.MCP
+
+    # Convert to dict
+    d = _to_dict(call)
+
+    # OpenAI: {function: {name, arguments}, id}
+    if "function" in d:
+        f = _to_dict(d["function"]) if not isinstance(d["function"], dict) else d["function"]
+        tc = MCPToolCall(name=f["name"], arguments=_parse_json_args(f.get("arguments")))
+        if d.get("id"):
+            tc.id = d["id"]
+        return tc, ToolFormat.OPENAI
+
+    # Claude: {type: "tool_use", name, input, id}
+    if d.get("type") == "tool_use":
+        tc = MCPToolCall(name=d["name"], arguments=d.get("input") or {})
+        if d.get("id"):
+            tc.id = d["id"]
+        return tc, ToolFormat.CLAUDE
+
+    # Gemini: {functionCall: {name, args}} or {name, args}
+    if "functionCall" in d:
+        fc = d["functionCall"]
+        return MCPToolCall(name=fc["name"], arguments=fc.get("args") or {}), ToolFormat.GEMINI
+
+    if "args" in d and "name" in d and "arguments" not in d:
+        return MCPToolCall(name=d["name"], arguments=d.get("args") or {}), ToolFormat.GEMINI
+
+    # Generic: {name, arguments/input}
+    if "name" in d:
+        tc = MCPToolCall(name=d["name"], arguments=d.get("arguments") or d.get("input") or {})
+        if d.get("id"):
+            tc.id = d["id"]
+        return tc, ToolFormat.MCP
+
+    raise ValueError(f"Unrecognized tool call format: {list(d.keys())}")
+
+
+def _is_tool_block(item: Any) -> bool:
+    """Check if item is a tool call (not text/other content)."""
+    t = item.get("type") if isinstance(item, dict) else getattr(item, "type", None)
+    return t is None or t in ("tool_use", "function")
+
+
+def parse_tool_calls(calls: Any) -> list[tuple[MCPToolCall, ToolFormat]]:
+    """Parse multiple tool calls, filtering non-tool content (e.g. Claude TextBlock).
+
+    Args:
+        calls: Single call or list of calls in any format.
+
+    Returns:
+        List of (MCPToolCall, ToolFormat) tuples.
+    """
+    if calls is None:
+        return []
+    if not isinstance(calls, list):
+        try:
+            return [parse_tool_call(calls)]
+        except ValueError:
+            return []
+
+    results = []
+    for item in calls:
+        if not _is_tool_block(item):
+            continue
+        try:
+            results.append(parse_tool_call(item))
+        except ValueError:
+            continue
+    return results
+
+
+# -----------------------------------------------------------------------------
+# Result Formatting
+# -----------------------------------------------------------------------------
+
+
+def result_to_string(result: MCPToolResult) -> str:
+    """Convert MCPToolResult content to string.
+
+    Args:
+        result: MCP tool result with content blocks.
+
+    Returns:
+        String representation of the result content.
+    """
+    if not result.content:
+        return ""
+    parts = []
+    for block in result.content:
+        if (text := getattr(block, "text", None)) is not None:
+            parts.append(str(text))
+        elif (data := getattr(block, "data", None)) is not None:
+            parts.append(f"[binary: {len(data)} bytes]")
+    return "\n".join(parts)
+
+
+def format_result(result: MCPToolResult, tc: MCPToolCall, fmt: ToolFormat) -> Any:
+    """Format MCPToolResult based on the input format.
+
+    Args:
+        result: MCP tool result.
+        tc: Original tool call (for id/name).
+        fmt: Target format.
+
+    Returns:
+        OpenAI: {"role": "tool", "tool_call_id": ..., "content": ...}
+        Claude: {"type": "tool_result", "tool_use_id": ..., "content": ..., "is_error"?: bool}
+        Gemini: {"functionResponse": {"name": ..., "response": {"result": ...}}}
+        MCP: MCPToolResult unchanged
+    """
+    content = result_to_string(result)
+
+    if fmt == ToolFormat.OPENAI:
+        return {"role": "tool", "tool_call_id": tc.id, "content": content}
+
+    if fmt == ToolFormat.CLAUDE:
+        r: dict[str, Any] = {"type": "tool_result", "tool_use_id": tc.id, "content": content}
+        if result.isError:
+            r["is_error"] = True
+        return r

+    if fmt == ToolFormat.GEMINI:
+        return {"functionResponse": {"name": tc.name, "response": {"result": content}}}
+
+    return result  # MCP format - return as-is
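
The parsing helpers above normalize several provider-specific shapes into `MCPToolCall`. A minimal usage sketch, illustrative only: the import path assumes this hunk is `hud/environment/utils/formats.py` from the file list, and it exercises only behaviour visible in the hunk.

```python
# Illustrative sketch: normalize an OpenAI-style tool call into MCPToolCall.
from hud.environment.utils.formats import ToolFormat, parse_tool_call

openai_call = {
    "id": "call_123",
    "function": {"name": "navigate", "arguments": '{"url": "https://example.com"}'},
}

tc, fmt = parse_tool_call(openai_call)
assert fmt is ToolFormat.OPENAI
assert tc.name == "navigate"
assert tc.arguments == {"url": "https://example.com"}
assert tc.id == "call_123"

# Shorthand inputs parse too: a bare tool name, or a (name, args) tuple.
parse_tool_call("screenshot")
parse_tool_call(("click", {"x": 10, "y": 20}))
```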
hud/environment/utils/schema.py ADDED
@@ -0,0 +1,171 @@
+"""Schema utilities for tool definitions."""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+__all__ = [
+    "ensure_strict_schema",
+    "json_type_to_python",
+    "schema_to_pydantic",
+    "validate_openai_schema",
+]
+
+logger = logging.getLogger(__name__)
+
+
+def ensure_strict_schema(schema: dict[str, Any]) -> dict[str, Any]:
+    """Ensure a JSON schema is compatible with OpenAI's strict mode.
+
+    OpenAI strict mode requires:
+    - additionalProperties: false on all objects
+    - All properties must be in required
+
+    Args:
+        schema: Original JSON schema.
+
+    Returns:
+        Modified schema for strict mode.
+    """
+    schema = dict(schema)
+
+    if schema.get("type") == "object":
+        schema["additionalProperties"] = False
+
+    if "properties" in schema:
+        # All properties must be required
+        schema["required"] = list(schema["properties"].keys())
+
+        # Recursively process nested objects
+        for prop_schema in schema["properties"].values():
+            if isinstance(prop_schema, dict):
+                _ensure_strict_recursive(prop_schema)
+
+    return schema
+
+
+def _ensure_strict_recursive(schema: dict[str, Any]) -> None:
+    """Recursively apply strict mode to nested schemas."""
+    if schema.get("type") == "object":
+        schema["additionalProperties"] = False
+        if "properties" in schema:
+            schema["required"] = list(schema["properties"].keys())
+            for prop_schema in schema["properties"].values():
+                if isinstance(prop_schema, dict):
+                    _ensure_strict_recursive(prop_schema)
+
+    elif schema.get("type") == "array" and "items" in schema:
+        if isinstance(schema["items"], dict):
+            _ensure_strict_recursive(schema["items"])
+
+
+def schema_to_pydantic(name: str, schema: dict[str, Any]) -> type:
+    """Convert JSON schema to a Pydantic model.
+
+    Args:
+        name: Model name (used for class name).
+        schema: JSON schema with properties.
+
+    Returns:
+        Dynamically created Pydantic model class.
+    """
+    from pydantic import Field, create_model
+
+    properties = schema.get("properties", {})
+    required = set(schema.get("required", []))
+
+    fields = {}
+    for prop_name, prop_schema in properties.items():
+        prop_type = json_type_to_python(prop_schema.get("type", "string"))
+        default = ... if prop_name in required else None
+        description = prop_schema.get("description", "")
+        fields[prop_name] = (prop_type, Field(default=default, description=description))
+
+    return create_model(f"{name}Input", **fields)
+
+
+def json_type_to_python(json_type: str) -> type:
+    """Map JSON schema type to Python type.
+
+    Args:
+        json_type: JSON schema type string.
+
+    Returns:
+        Corresponding Python type.
+    """
+    mapping = {
+        "string": str,
+        "integer": int,
+        "number": float,
+        "boolean": bool,
+        "array": list,
+        "object": dict,
+    }
+    return mapping.get(json_type, str)
+
+
+def validate_openai_schema(
+    schema: dict[str, Any],
+    tool_name: str = "unknown",
+    path: str = "",
+) -> list[str]:
+    """Validate a JSON schema for OpenAI API compatibility.
+
+    OpenAI's API has specific requirements for tool schemas:
+    - Arrays must have 'items' (not 'prefixItems' which tuples generate)
+    - Certain schema features like 'prefixItems' are not supported
+
+    Args:
+        schema: JSON schema to validate.
+        tool_name: Name of the tool (for error messages).
+        path: Current path in schema (for error context).
+
+    Returns:
+        List of validation error messages. Empty if valid.
+    """
+    errors: list[str] = []
+
+    if not isinstance(schema, dict):
+        return errors
+
+    # Check for prefixItems (generated by tuple types)
+    if "prefixItems" in schema:
+        errors.append(
+            f"Tool '{tool_name}' has 'prefixItems' at {path or 'root'} "
+            "(likely from tuple type). Use list[Model] instead of tuple."
+        )
+
+    # Check arrays have 'items'
+    if schema.get("type") == "array" and "items" not in schema and "prefixItems" not in schema:
+        errors.append(
+            f"Tool '{tool_name}' has array at {path or 'root'} without 'items'. "
+            "OpenAI requires 'items' for array schemas."
+        )
+
+    # Recursively check nested schemas
+    # Check properties
+    if "properties" in schema:
+        for prop_name, prop_schema in schema["properties"].items():
+            prop_path = f"{path}.{prop_name}" if path else prop_name
+            errors.extend(validate_openai_schema(prop_schema, tool_name, prop_path))
+
+    # Check items
+    if "items" in schema and isinstance(schema["items"], dict):
+        items_path = f"{path}[items]" if path else "[items]"
+        errors.extend(validate_openai_schema(schema["items"], tool_name, items_path))
+
+    # Check anyOf/oneOf/allOf
+    for key in ("anyOf", "oneOf", "allOf"):
+        if key in schema:
+            for i, sub_schema in enumerate(schema[key]):
+                sub_path = f"{path}.{key}[{i}]" if path else f"{key}[{i}]"
+                errors.extend(validate_openai_schema(sub_schema, tool_name, sub_path))
+
+    # Check $defs (definitions)
+    if "$defs" in schema:
+        for def_name, def_schema in schema["$defs"].items():
+            def_path = f"$defs.{def_name}"
+            errors.extend(validate_openai_schema(def_schema, tool_name, def_path))
+
+    return errors
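
A short sketch of how the validator and the strict-mode helper compose, illustrative only: the import path assumes this hunk is `hud/environment/utils/schema.py` from the file list, and the `navigate` tool schema is made up.

```python
# Illustrative sketch: flag an OpenAI-incompatible array schema, then tighten
# the same schema for strict mode with ensure_strict_schema.
from hud.environment.utils.schema import ensure_strict_schema, validate_openai_schema

schema = {
    "type": "object",
    "properties": {
        "url": {"type": "string", "description": "Page to open"},
        "tabs": {"type": "array"},  # no 'items' -> rejected by OpenAI
    },
}

for problem in validate_openai_schema(schema, tool_name="navigate"):
    print(problem)  # Tool 'navigate' has array at tabs without 'items'. ...

strict = ensure_strict_schema(schema)
assert strict["additionalProperties"] is False
assert strict["required"] == ["url", "tabs"]  # every property becomes required
```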
hud/environment/utils/tool_wrappers.py ADDED
@@ -0,0 +1,113 @@
+"""Shared tool wrapper utilities for agent framework integrations."""
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+    import mcp.types as mcp_types
+
+__all__ = [
+    "create_async_tool_fn",
+    "create_sync_tool_fn",
+    "create_tool_fns",
+    "stringify_result",
+]
+
+
+def stringify_result(result: Any) -> str:
+    """Convert a tool result to string format.
+
+    Args:
+        result: The tool result (str, dict, or other).
+
+    Returns:
+        String representation of the result.
+    """
+    if isinstance(result, str):
+        return result
+    return json.dumps(result) if result else ""
+
+
+def create_async_tool_fn(
+    env: Any,
+    tool_name: str,
+    description: str | None = None,
+) -> Callable[..., Any]:
+    """Create an async function that calls a tool on the environment.
+
+    Args:
+        env: Environment with call_tool method.
+        tool_name: Name of the tool to call.
+        description: Optional description for the function docstring.
+
+    Returns:
+        Async function that calls the tool and returns string result.
+    """
+
+    async def async_fn(**kwargs: Any) -> str:
+        result = await env.call_tool(tool_name, **kwargs)
+        return stringify_result(result)
+
+    async_fn.__name__ = tool_name
+    async_fn.__doc__ = description or f"Tool: {tool_name}"
+    return async_fn
+
+
+def create_sync_tool_fn(
+    env: Any,
+    tool_name: str,
+    description: str | None = None,
+) -> Callable[..., Any]:
+    """Create a sync function that calls a tool on the environment.
+
+    This handles the complexity of running async code from sync context,
+    including when already in an async event loop.
+
+    Args:
+        env: Environment with call_tool method.
+        tool_name: Name of the tool to call.
+        description: Optional description for the function docstring.
+
+    Returns:
+        Sync function that calls the tool and returns string result.
+    """
+    import asyncio
+
+    def sync_fn(**kwargs: Any) -> str:
+        loop = asyncio.get_event_loop()
+        if loop.is_running():
+            import concurrent.futures
+
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future = executor.submit(asyncio.run, env.call_tool(tool_name, **kwargs))
+                result = future.result()
+        else:
+            result = loop.run_until_complete(env.call_tool(tool_name, **kwargs))
+
+        return stringify_result(result)
+
+    sync_fn.__name__ = tool_name
+    sync_fn.__doc__ = description or f"Tool: {tool_name}"
+    return sync_fn
+
+
+def create_tool_fns(
+    env: Any,
+    tool: mcp_types.Tool,
+) -> tuple[Callable[..., str], Callable[..., Any]]:
+    """Create both sync and async functions for a tool.
+
+    Args:
+        env: Environment with call_tool method.
+        tool: MCP tool definition.
+
+    Returns:
+        Tuple of (sync_fn, async_fn).
+    """
+    sync_fn = create_sync_tool_fn(env, tool.name, tool.description)
+    async_fn = create_async_tool_fn(env, tool.name, tool.description)
+    return sync_fn, async_fn
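
How the wrappers are meant to be consumed by the framework integrations, as a sketch under stated assumptions: `FakeEnv` is an invented stand-in for anything exposing `call_tool`, and the import path assumes this hunk is `hud/environment/utils/tool_wrappers.py` from the file list.

```python
# Illustrative sketch: expose an environment tool as a plain async callable.
import asyncio

from hud.environment.utils.tool_wrappers import create_async_tool_fn


class FakeEnv:
    """Invented stand-in for an Environment exposing call_tool."""

    async def call_tool(self, name: str, **kwargs: object) -> dict:
        return {"tool": name, "args": kwargs}


navigate = create_async_tool_fn(FakeEnv(), "navigate", "Open a URL in the browser.")
print(navigate.__name__)  # navigate
print(asyncio.run(navigate(url="https://example.com")))  # JSON string via stringify_result
```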
hud/eval/__init__.py ADDED
@@ -0,0 +1,67 @@
+"""HUD Eval - Evaluation context and management.
+
+This module provides:
+- Task: A runnable evaluation unit (from env())
+- EvalContext: Environment with evaluation tracking (trace_id, reward, etc.)
+- eval(): Standalone context manager for task-based evaluation
+
+Usage:
+    # Using env() to create Task
+    env = Environment("my-env").connect_hub("browser")
+
+    async with env() as ctx:
+        await ctx.call_tool("navigate", url="...")
+
+    async with env("checkout", user_id="alice") as ctx:
+        await agent.run(ctx.prompt)
+
+    # Standalone with task slugs
+    async with hud.eval("my-org/task:1") as ctx:
+        await agent.run(ctx)
+
+    # Orchestrated with Task objects
+    tasks = [env("checkout", user_id="alice"), env("checkout", user_id="bob")]
+    async with hud.eval(tasks, variants={"model": ["gpt-4o"]}, group=4) as ctx:
+        await agent.run(ctx.prompt)
+
+    # Blank eval for manual reward
+    async with hud.eval() as ctx:
+        ctx.reward = compute_reward()
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+# Auto-instrument httpx on import
+import hud.eval.instrument  # noqa: F401
+
+# run_eval is safe to import (uses lazy imports internally)
+from hud.eval.manager import run_eval
+
+# Task is safe to import
+from hud.eval.task import Task
+
+# Utils for v4 format handling
+from hud.eval.utils import build_env_from_v4, is_v4_format, validate_v4_task
+
+if TYPE_CHECKING:
+    from hud.eval.context import EvalContext
+
+__all__ = [
+    "EvalContext",
+    "Task",
+    "build_env_from_v4",
+    "is_v4_format",
+    "run_eval",
+    "validate_v4_task",
+]
+
+
+def __getattr__(name: str) -> object:
+    """Lazy import EvalContext to avoid circular imports."""
+    if name == "EvalContext":
+        from hud.eval.context import EvalContext
+
+        return EvalContext
+    raise AttributeError(f"module 'hud.eval' has no attribute {name!r}")
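
The trailing `__getattr__` is the PEP 562 module-level lazy-import pattern: `EvalContext` resolves at runtime only when first accessed, while type checkers see it through the `TYPE_CHECKING` import. A sketch of how downstream code can lean on that split, illustrative only; it assumes `EvalContext` exposes the `reward` attribute used in the docstring above.

```python
# Illustrative sketch: annotate against EvalContext without forcing the
# eager import of hud.eval.context at module load time.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from hud.eval import EvalContext  # resolved by type checkers only


def summarize(ctx: EvalContext) -> str:
    # At runtime the annotation stays a string (PEP 563), so hud.eval.context
    # is imported only if hud.eval.EvalContext is actually accessed.
    return f"reward={ctx.reward}"
```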