hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/environment/utils/formats.py
ADDED

@@ -0,0 +1,215 @@
"""Tool format parsing and conversion for OpenAI, Claude, Gemini, and MCP."""

from __future__ import annotations

import json
from enum import Enum, auto
from typing import Any

from hud.types import MCPToolCall, MCPToolResult

__all__ = [
    "ToolFormat",
    "format_result",
    "parse_tool_call",
    "parse_tool_calls",
    "result_to_string",
]


class ToolFormat(Enum):
    """Detected tool call format."""

    OPENAI = auto()  # function.arguments as JSON string
    CLAUDE = auto()  # type="tool_use", input as dict
    GEMINI = auto()  # functionCall with args
    MCP = auto()  # name + arguments


# -----------------------------------------------------------------------------
# Parsing
# -----------------------------------------------------------------------------


def _to_dict(obj: Any) -> dict[str, Any]:
    """Convert object to dict for uniform processing."""
    if isinstance(obj, dict):
        return obj
    if hasattr(obj, "model_dump"):
        return obj.model_dump()
    if hasattr(obj, "__dict__"):
        return vars(obj)
    raise ValueError(f"Cannot convert {type(obj).__name__} to dict")


def _parse_json_args(args: Any) -> dict[str, Any]:
    """Parse arguments, handling JSON strings."""
    if not args:
        return {}
    if isinstance(args, str):
        try:
            return json.loads(args)
        except json.JSONDecodeError:
            return {}
    return args


def parse_tool_call(call: Any, **kwargs: Any) -> tuple[MCPToolCall, ToolFormat]:
    """Parse any tool call format into (MCPToolCall, ToolFormat).

    Supports:
    - String (tool name only, or with kwargs)
    - Tuple: (name,), (name, args), (name, args, id)
    - MCPToolCall
    - OpenAI: {function: {name, arguments}, id}
    - Claude: {type: "tool_use", name, input, id}
    - Gemini: {functionCall: {name, args}} or {name, args}
    - Generic: {name, arguments}

    Args:
        call: Tool call in any supported format.
        **kwargs: Additional arguments (merged when call is a string).

    Returns:
        Tuple of (MCPToolCall, ToolFormat) for the parsed call.

    Raises:
        ValueError: If format is unrecognized.
    """
    # Primitives
    if isinstance(call, str):
        return MCPToolCall(name=call, arguments=kwargs or {}), ToolFormat.MCP

    if isinstance(call, tuple):
        tc = MCPToolCall(name=call[0], arguments=call[1] if len(call) > 1 else {})
        if len(call) > 2:
            tc.id = call[2]
        return tc, ToolFormat.MCP

    if isinstance(call, MCPToolCall):
        return call, ToolFormat.MCP

    # Convert to dict
    d = _to_dict(call)

    # OpenAI: {function: {name, arguments}, id}
    if "function" in d:
        f = _to_dict(d["function"]) if not isinstance(d["function"], dict) else d["function"]
        tc = MCPToolCall(name=f["name"], arguments=_parse_json_args(f.get("arguments")))
        if d.get("id"):
            tc.id = d["id"]
        return tc, ToolFormat.OPENAI

    # Claude: {type: "tool_use", name, input, id}
    if d.get("type") == "tool_use":
        tc = MCPToolCall(name=d["name"], arguments=d.get("input") or {})
        if d.get("id"):
            tc.id = d["id"]
        return tc, ToolFormat.CLAUDE

    # Gemini: {functionCall: {name, args}} or {name, args}
    if "functionCall" in d:
        fc = d["functionCall"]
        return MCPToolCall(name=fc["name"], arguments=fc.get("args") or {}), ToolFormat.GEMINI

    if "args" in d and "name" in d and "arguments" not in d:
        return MCPToolCall(name=d["name"], arguments=d.get("args") or {}), ToolFormat.GEMINI

    # Generic: {name, arguments/input}
    if "name" in d:
        tc = MCPToolCall(name=d["name"], arguments=d.get("arguments") or d.get("input") or {})
        if d.get("id"):
            tc.id = d["id"]
        return tc, ToolFormat.MCP

    raise ValueError(f"Unrecognized tool call format: {list(d.keys())}")


def _is_tool_block(item: Any) -> bool:
    """Check if item is a tool call (not text/other content)."""
    t = item.get("type") if isinstance(item, dict) else getattr(item, "type", None)
    return t is None or t in ("tool_use", "function")


def parse_tool_calls(calls: Any) -> list[tuple[MCPToolCall, ToolFormat]]:
    """Parse multiple tool calls, filtering non-tool content (e.g. Claude TextBlock).

    Args:
        calls: Single call or list of calls in any format.

    Returns:
        List of (MCPToolCall, ToolFormat) tuples.
    """
    if calls is None:
        return []
    if not isinstance(calls, list):
        try:
            return [parse_tool_call(calls)]
        except ValueError:
            return []

    results = []
    for item in calls:
        if not _is_tool_block(item):
            continue
        try:
            results.append(parse_tool_call(item))
        except ValueError:
            continue
    return results


# -----------------------------------------------------------------------------
# Result Formatting
# -----------------------------------------------------------------------------


def result_to_string(result: MCPToolResult) -> str:
    """Convert MCPToolResult content to string.

    Args:
        result: MCP tool result with content blocks.

    Returns:
        String representation of the result content.
    """
    if not result.content:
        return ""
    parts = []
    for block in result.content:
        if (text := getattr(block, "text", None)) is not None:
            parts.append(str(text))
        elif (data := getattr(block, "data", None)) is not None:
            parts.append(f"[binary: {len(data)} bytes]")
    return "\n".join(parts)


def format_result(result: MCPToolResult, tc: MCPToolCall, fmt: ToolFormat) -> Any:
    """Format MCPToolResult based on the input format.

    Args:
        result: MCP tool result.
        tc: Original tool call (for id/name).
        fmt: Target format.

    Returns:
        OpenAI: {"role": "tool", "tool_call_id": ..., "content": ...}
        Claude: {"type": "tool_result", "tool_use_id": ..., "content": ..., "is_error"?: bool}
        Gemini: {"functionResponse": {"name": ..., "response": {"result": ...}}}
        MCP: MCPToolResult unchanged
    """
    content = result_to_string(result)

    if fmt == ToolFormat.OPENAI:
        return {"role": "tool", "tool_call_id": tc.id, "content": content}

    if fmt == ToolFormat.CLAUDE:
        r: dict[str, Any] = {"type": "tool_result", "tool_use_id": tc.id, "content": content}
        if result.isError:
            r["is_error"] = True
        return r

    if fmt == ToolFormat.GEMINI:
        return {"functionResponse": {"name": tc.name, "response": {"result": content}}}

    return result  # MCP format - return as-is
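The parser above normalizes provider-specific payloads into `MCPToolCall` while remembering where they came from. A minimal sketch of what that looks like in practice, assuming the module imports as `hud.environment.utils.formats` (path inferred from the file listing; the dict literals imitate provider payloads and are illustrative, not captured API output):

```python
# Sketch: normalizing tool calls from different providers.
from hud.environment.utils.formats import ToolFormat, parse_tool_call

# OpenAI-style call: arguments arrive as a JSON string.
openai_call = {
    "id": "call_123",
    "function": {"name": "navigate", "arguments": '{"url": "https://example.com"}'},
}
tc, fmt = parse_tool_call(openai_call)
assert fmt is ToolFormat.OPENAI
assert tc.name == "navigate"
assert tc.arguments == {"url": "https://example.com"}  # JSON string decoded

# Claude-style tool_use block: arguments arrive as a dict under "input".
claude_call = {
    "type": "tool_use",
    "id": "toolu_1",
    "name": "navigate",
    "input": {"url": "https://example.com"},
}
tc2, fmt2 = parse_tool_call(claude_call)
assert fmt2 is ToolFormat.CLAUDE

# Shorthand: a bare tool name normalizes to MCP format with empty args.
tc3, fmt3 = parse_tool_call("screenshot")
assert fmt3 is ToolFormat.MCP and tc3.arguments == {}
```

`format_result` then performs the reverse leg: given the `MCPToolResult` and the detected `ToolFormat`, it emits the provider-appropriate result shape (e.g. a `{"role": "tool", ...}` message for OPENAI), so callers can stay format-agnostic on both sides of a tool call.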
hud/environment/utils/schema.py
ADDED

@@ -0,0 +1,171 @@
"""Schema utilities for tool definitions."""

from __future__ import annotations

import logging
from typing import Any

__all__ = [
    "ensure_strict_schema",
    "json_type_to_python",
    "schema_to_pydantic",
    "validate_openai_schema",
]

logger = logging.getLogger(__name__)


def ensure_strict_schema(schema: dict[str, Any]) -> dict[str, Any]:
    """Ensure a JSON schema is compatible with OpenAI's strict mode.

    OpenAI strict mode requires:
    - additionalProperties: false on all objects
    - All properties must be in required

    Args:
        schema: Original JSON schema.

    Returns:
        Modified schema for strict mode.
    """
    schema = dict(schema)

    if schema.get("type") == "object":
        schema["additionalProperties"] = False

    if "properties" in schema:
        # All properties must be required
        schema["required"] = list(schema["properties"].keys())

        # Recursively process nested objects
        for prop_schema in schema["properties"].values():
            if isinstance(prop_schema, dict):
                _ensure_strict_recursive(prop_schema)

    return schema


def _ensure_strict_recursive(schema: dict[str, Any]) -> None:
    """Recursively apply strict mode to nested schemas."""
    if schema.get("type") == "object":
        schema["additionalProperties"] = False
        if "properties" in schema:
            schema["required"] = list(schema["properties"].keys())
            for prop_schema in schema["properties"].values():
                if isinstance(prop_schema, dict):
                    _ensure_strict_recursive(prop_schema)

    elif schema.get("type") == "array" and "items" in schema:
        if isinstance(schema["items"], dict):
            _ensure_strict_recursive(schema["items"])


def schema_to_pydantic(name: str, schema: dict[str, Any]) -> type:
    """Convert JSON schema to a Pydantic model.

    Args:
        name: Model name (used for class name).
        schema: JSON schema with properties.

    Returns:
        Dynamically created Pydantic model class.
    """
    from pydantic import Field, create_model

    properties = schema.get("properties", {})
    required = set(schema.get("required", []))

    fields = {}
    for prop_name, prop_schema in properties.items():
        prop_type = json_type_to_python(prop_schema.get("type", "string"))
        default = ... if prop_name in required else None
        description = prop_schema.get("description", "")
        fields[prop_name] = (prop_type, Field(default=default, description=description))

    return create_model(f"{name}Input", **fields)


def json_type_to_python(json_type: str) -> type:
    """Map JSON schema type to Python type.

    Args:
        json_type: JSON schema type string.

    Returns:
        Corresponding Python type.
    """
    mapping = {
        "string": str,
        "integer": int,
        "number": float,
        "boolean": bool,
        "array": list,
        "object": dict,
    }
    return mapping.get(json_type, str)


def validate_openai_schema(
    schema: dict[str, Any],
    tool_name: str = "unknown",
    path: str = "",
) -> list[str]:
    """Validate a JSON schema for OpenAI API compatibility.

    OpenAI's API has specific requirements for tool schemas:
    - Arrays must have 'items' (not 'prefixItems' which tuples generate)
    - Certain schema features like 'prefixItems' are not supported

    Args:
        schema: JSON schema to validate.
        tool_name: Name of the tool (for error messages).
        path: Current path in schema (for error context).

    Returns:
        List of validation error messages. Empty if valid.
    """
    errors: list[str] = []

    if not isinstance(schema, dict):
        return errors

    # Check for prefixItems (generated by tuple types)
    if "prefixItems" in schema:
        errors.append(
            f"Tool '{tool_name}' has 'prefixItems' at {path or 'root'} "
            "(likely from tuple type). Use list[Model] instead of tuple."
        )

    # Check arrays have 'items'
    if schema.get("type") == "array" and "items" not in schema and "prefixItems" not in schema:
        errors.append(
            f"Tool '{tool_name}' has array at {path or 'root'} without 'items'. "
            "OpenAI requires 'items' for array schemas."
        )

    # Recursively check nested schemas
    # Check properties
    if "properties" in schema:
        for prop_name, prop_schema in schema["properties"].items():
            prop_path = f"{path}.{prop_name}" if path else prop_name
            errors.extend(validate_openai_schema(prop_schema, tool_name, prop_path))

    # Check items
    if "items" in schema and isinstance(schema["items"], dict):
        items_path = f"{path}[items]" if path else "[items]"
        errors.extend(validate_openai_schema(schema["items"], tool_name, items_path))

    # Check anyOf/oneOf/allOf
    for key in ("anyOf", "oneOf", "allOf"):
        if key in schema:
            for i, sub_schema in enumerate(schema[key]):
                sub_path = f"{path}.{key}[{i}]" if path else f"{key}[{i}]"
                errors.extend(validate_openai_schema(sub_schema, tool_name, sub_path))

    # Check $defs (definitions)
    if "$defs" in schema:
        for def_name, def_schema in schema["$defs"].items():
            def_path = f"$defs.{def_name}"
            errors.extend(validate_openai_schema(def_schema, tool_name, def_path))

    return errors
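To make the strict-mode transform and the validator concrete, here is a minimal sketch against a small tool schema (import path inferred from the file listing; the schema itself is illustrative):

```python
# Sketch: strictifying a schema and catching an OpenAI-incompatible array.
from hud.environment.utils.schema import ensure_strict_schema, validate_openai_schema

schema = {
    "type": "object",
    "properties": {
        "query": {"type": "string", "description": "Search terms"},
        "filters": {
            "type": "object",
            "properties": {"site": {"type": "string"}},
        },
        "tags": {"type": "array"},  # no 'items' -- flagged by the validator
    },
}

strict = ensure_strict_schema(schema)
assert strict["additionalProperties"] is False
assert strict["required"] == ["query", "filters", "tags"]  # every property required
assert strict["properties"]["filters"]["additionalProperties"] is False  # recursed

errors = validate_openai_schema(schema, tool_name="search")
assert len(errors) == 1 and "without 'items'" in errors[0]
```

Note that `ensure_strict_schema` takes a shallow `dict()` copy, so nested property schemas are strictified in place on the original object; only the top level is copied.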
hud/environment/utils/tool_wrappers.py
ADDED

@@ -0,0 +1,113 @@
"""Shared tool wrapper utilities for agent framework integrations."""

from __future__ import annotations

import json
from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
    from collections.abc import Callable

    import mcp.types as mcp_types

__all__ = [
    "create_async_tool_fn",
    "create_sync_tool_fn",
    "create_tool_fns",
    "stringify_result",
]


def stringify_result(result: Any) -> str:
    """Convert a tool result to string format.

    Args:
        result: The tool result (str, dict, or other).

    Returns:
        String representation of the result.
    """
    if isinstance(result, str):
        return result
    return json.dumps(result) if result else ""


def create_async_tool_fn(
    env: Any,
    tool_name: str,
    description: str | None = None,
) -> Callable[..., Any]:
    """Create an async function that calls a tool on the environment.

    Args:
        env: Environment with call_tool method.
        tool_name: Name of the tool to call.
        description: Optional description for the function docstring.

    Returns:
        Async function that calls the tool and returns string result.
    """

    async def async_fn(**kwargs: Any) -> str:
        result = await env.call_tool(tool_name, **kwargs)
        return stringify_result(result)

    async_fn.__name__ = tool_name
    async_fn.__doc__ = description or f"Tool: {tool_name}"
    return async_fn


def create_sync_tool_fn(
    env: Any,
    tool_name: str,
    description: str | None = None,
) -> Callable[..., Any]:
    """Create a sync function that calls a tool on the environment.

    This handles the complexity of running async code from sync context,
    including when already in an async event loop.

    Args:
        env: Environment with call_tool method.
        tool_name: Name of the tool to call.
        description: Optional description for the function docstring.

    Returns:
        Sync function that calls the tool and returns string result.
    """
    import asyncio

    def sync_fn(**kwargs: Any) -> str:
        loop = asyncio.get_event_loop()
        if loop.is_running():
            import concurrent.futures

            with concurrent.futures.ThreadPoolExecutor() as executor:
                future = executor.submit(asyncio.run, env.call_tool(tool_name, **kwargs))
                result = future.result()
        else:
            result = loop.run_until_complete(env.call_tool(tool_name, **kwargs))

        return stringify_result(result)

    sync_fn.__name__ = tool_name
    sync_fn.__doc__ = description or f"Tool: {tool_name}"
    return sync_fn


def create_tool_fns(
    env: Any,
    tool: mcp_types.Tool,
) -> tuple[Callable[..., str], Callable[..., Any]]:
    """Create both sync and async functions for a tool.

    Args:
        env: Environment with call_tool method.
        tool: MCP tool definition.

    Returns:
        Tuple of (sync_fn, async_fn).
    """
    sync_fn = create_sync_tool_fn(env, tool.name, tool.description)
    async_fn = create_async_tool_fn(env, tool.name, tool.description)
    return sync_fn, async_fn
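A minimal sketch of both wrapper flavors, using a hypothetical stand-in environment (`StubEnv` is not part of the package; the only assumption is an object exposing an async `call_tool` accepting keyword arguments, as the code above requires):

```python
# Sketch: wrapping an environment tool as sync and async callables.
import asyncio

from hud.environment.utils.tool_wrappers import (
    create_async_tool_fn,
    create_sync_tool_fn,
)


class StubEnv:
    """Hypothetical stand-in for an Environment with a call_tool method."""

    async def call_tool(self, name: str, **kwargs: object) -> dict:
        return {"tool": name, "kwargs": kwargs}


env = StubEnv()

# Sync wrapper: callable from plain synchronous code (no running loop here,
# so it goes through loop.run_until_complete).
navigate = create_sync_tool_fn(env, "navigate", "Open a URL")
print(navigate(url="https://example.com"))
# {"tool": "navigate", "kwargs": {"url": "https://example.com"}}

# Async wrapper: awaited inside an event loop.
navigate_async = create_async_tool_fn(env, "navigate", "Open a URL")
print(asyncio.run(navigate_async(url="https://example.com")))
```

The thread-pool branch in `sync_fn` exists for the awkward case where the caller is synchronous code running inside an already-running event loop, where `run_until_complete` would fail; the work is shipped to a fresh loop on another thread via `asyncio.run`.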
hud/eval/__init__.py
ADDED

@@ -0,0 +1,67 @@
"""HUD Eval - Evaluation context and management.

This module provides:
- Task: A runnable evaluation unit (from env())
- EvalContext: Environment with evaluation tracking (trace_id, reward, etc.)
- eval(): Standalone context manager for task-based evaluation

Usage:
    # Using env() to create Task
    env = Environment("my-env").connect_hub("browser")

    async with env() as ctx:
        await ctx.call_tool("navigate", url="...")

    async with env("checkout", user_id="alice") as ctx:
        await agent.run(ctx.prompt)

    # Standalone with task slugs
    async with hud.eval("my-org/task:1") as ctx:
        await agent.run(ctx)

    # Orchestrated with Task objects
    tasks = [env("checkout", user_id="alice"), env("checkout", user_id="bob")]
    async with hud.eval(tasks, variants={"model": ["gpt-4o"]}, group=4) as ctx:
        await agent.run(ctx.prompt)

    # Blank eval for manual reward
    async with hud.eval() as ctx:
        ctx.reward = compute_reward()
"""

from __future__ import annotations

from typing import TYPE_CHECKING

# Auto-instrument httpx on import
import hud.eval.instrument  # noqa: F401

# run_eval is safe to import (uses lazy imports internally)
from hud.eval.manager import run_eval

# Task is safe to import
from hud.eval.task import Task

# Utils for v4 format handling
from hud.eval.utils import build_env_from_v4, is_v4_format, validate_v4_task

if TYPE_CHECKING:
    from hud.eval.context import EvalContext

__all__ = [
    "EvalContext",
    "Task",
    "build_env_from_v4",
    "is_v4_format",
    "run_eval",
    "validate_v4_task",
]


def __getattr__(name: str) -> object:
    """Lazy import EvalContext to avoid circular imports."""
    if name == "EvalContext":
        from hud.eval.context import EvalContext

        return EvalContext
    raise AttributeError(f"module 'hud.eval' has no attribute {name!r}")
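The module-level `__getattr__` at the bottom is the PEP 562 lazy-attribute hook. A minimal sketch of the behavior it produces, grounded in the code above:

```python
# Sketch: eager vs. lazy attribute resolution on hud.eval.
import hud.eval

task_cls = hud.eval.Task  # imported eagerly at module load

# EvalContext is only imported on first access, which breaks the
# circular import between hud.eval and hud.eval.context.
ctx_cls = hud.eval.EvalContext

# Unknown attributes still raise AttributeError, as __getattr__ re-raises.
try:
    hud.eval.missing_attr
except AttributeError as e:
    print(e)  # module 'hud.eval' has no attribute 'missing_attr'
```

Because `EvalContext` is listed in `__all__` but only imported under `TYPE_CHECKING`, static type checkers see the real class while the runtime defers the import until someone actually touches the attribute.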