hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/eval/utils.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""Utility functions for the eval module."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import warnings
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
__all__ = ["build_env_from_v4", "is_v4_format", "validate_v4_task"]
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def is_v4_format(data: dict[str, Any]) -> bool:
    """Report whether ``data`` has the shape of a v4 LegacyTask.

    This is a cheap branching check only: a mapping counts as v4 when both
    of its core fields, ``prompt`` and ``mcp_config``, hold truthy values.
    It makes no statement about whether the task is complete.

    Args:
        data: Candidate object to inspect.

    Returns:
        True when ``data`` is a dict with truthy ``prompt`` and
        ``mcp_config`` entries, False otherwise (including non-dict input).
    """
    # Anything that is not a dict cannot be a v4 task.
    if not isinstance(data, dict):
        return False

    # Both core keys must be present and non-empty.
    return all(bool(data.get(field)) for field in ("prompt", "mcp_config"))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def validate_v4_task(data: dict[str, Any]) -> None:
    """Ensure a v4 task carries every required field.

    The required fields are:
    - prompt: The task instruction
    - mcp_config: MCP server configuration
    - evaluate_tool: How to evaluate success

    Intended to be called after is_v4_format() when completeness matters.

    Args:
        data: Dict to validate

    Raises:
        ValueError: If one or more required fields are absent or empty.
            The message lists them in the order shown above.
    """
    required_fields = ("prompt", "mcp_config", "evaluate_tool")

    # A field counts as missing when it is absent or holds a falsy value.
    absent = [field for field in required_fields if not data.get(field)]
    if not absent:
        return

    raise ValueError(f"v4 task missing required fields: {', '.join(absent)}")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def build_env_from_v4(source: dict[str, Any] | Any) -> dict[str, Any]:
    """Translate a v4 LegacyTask into keyword arguments for ``Task()``.

    An Environment is created and configured from the legacy task's fields
    (MCP config, prompt, setup/evaluate tool calls, agent tool filters).
    Per the inline notes, configuration is only stored here; no connection
    is opened and no tool call is executed.

    Args:
        source: dict or LegacyTask with v4 fields (prompt, mcp_config, etc.)

    Returns:
        Dict that always contains ``env``, ``id``, ``scenario`` (None) and
        ``args`` (empty dict), plus ``validation``, ``agent_config`` and
        ``metadata`` when the legacy task provides the corresponding data.

    Raises:
        TypeError: If source is not a dict or LegacyTask
    """
    from hud.environment import Environment
    from hud.types import LegacyTask, MCPToolCall

    def _as_list(value: Any) -> list[Any]:
        # v4 allows a single call or a list of calls; normalise to a list.
        return value if isinstance(value, list) else [value]

    # Normalise the input to a LegacyTask instance.
    if isinstance(source, dict):
        # Constructing LegacyTask may emit DeprecationWarning; keep it quiet here.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=DeprecationWarning)
            legacy = LegacyTask(**source)
    elif isinstance(source, LegacyTask):
        legacy = source
    else:
        raise TypeError(f"Expected dict or LegacyTask, got {type(source).__name__}")

    # Local MCP servers (command without url) get a concurrency warning.
    _warn_local_mcp(legacy.mcp_config)

    # v4 keeps tool filters on agent_config; they are agent-level, not
    # connection-level, filters.
    agent_cfg = legacy.agent_config
    include_tools: list[str] | None = agent_cfg.allowed_tools if agent_cfg else None
    exclude_tools: list[str] | None = agent_cfg.disallowed_tools if agent_cfg else None
    if include_tools == ["*"]:
        # The ["*"] wildcard means "include everything", expressed as None.
        include_tools = None

    # Build the Environment; only configuration is recorded at this point.
    env = Environment(legacy.id or "v4-legacy")
    env.connect_mcp_config(legacy.mcp_config)

    # Agent-level filters live on the Environment (applied in as_tools()), so
    # the Environment can still call setup/evaluate tools hidden from the agent.
    env._agent_include = include_tools
    env._agent_exclude = exclude_tools

    env.prompt = legacy.prompt

    # Register setup and evaluate calls (stored, not executed).
    if legacy.setup_tool:
        for setup_call in _as_list(legacy.setup_tool):
            env.setup_tool(setup_call.name, **(setup_call.arguments or {}))

    if legacy.evaluate_tool:
        for eval_call in _as_list(legacy.evaluate_tool):
            env.evaluate_tool(eval_call.name, **(eval_call.arguments or {}))

    # Base Task fields; v4 tasks are prompt-driven, so there is no scenario.
    task_fields: dict[str, Any] = {
        "env": env,
        "id": legacy.id,
        "scenario": None,
        "args": {},
    }

    # integration_test_tool maps onto validation, and is mirrored on the env
    # as _integration_test_calls for IntegrationTestRunner compatibility.
    if legacy.integration_test_tool:
        checks = _as_list(legacy.integration_test_tool)
        task_fields["validation"] = [
            check if isinstance(check, MCPToolCall) else MCPToolCall(**check.model_dump())
            for check in checks
        ]
        env._integration_test_calls = [
            (check.name, check.arguments or {}) for check in checks
        ]

    # Forward the agent_config fields that are set to a truthy value.
    if agent_cfg:
        forwarded: dict[str, Any] = {}
        for field in ("system_prompt", "append_setup_output", "append_setup_tool"):
            value = getattr(agent_cfg, field)
            if value:
                forwarded[field] = value
        if forwarded:
            task_fields["agent_config"] = forwarded

    # Carry metadata over unchanged.
    if legacy.metadata:
        task_fields["metadata"] = legacy.metadata

    return task_fields
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _warn_local_mcp(mcp_config: dict[str, Any] | None) -> None:
    """Emit a UserWarning when any server entry is local (command without url).

    Local MCP servers can cause port conflicts when running tasks concurrently.
    Entries that are not dicts are ignored; an empty or missing config is a no-op.
    """
    if not mcp_config:
        return

    for server_cfg in mcp_config.values():
        if not isinstance(server_cfg, dict):
            continue
        # "Local" means a command is configured and no (truthy) url is given.
        if "command" in server_cfg and not server_cfg.get("url"):
            break
    else:
        # No local server found: nothing to warn about.
        return

    warnings.warn(
        "Task uses local MCP configuration (command without url). "
        "This may cause port conflicts when running tasks concurrently. "
        "Consider using remote MCP servers for parallel execution.",
        UserWarning,
        stacklevel=4,
    )
|
hud/patches/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HUD runtime patches for third-party libraries.
|
|
3
|
+
|
|
4
|
+
This module applies monkey-patches to fix issues in dependencies
|
|
5
|
+
without requiring forked packages.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from hud.patches.mcp_patches import apply_all_patches, suppress_fastmcp_logging
|
|
9
|
+
from hud.patches.warnings import apply_default_warning_filters, suppress_mcp_use_import_warnings
|
|
10
|
+
|
|
11
|
+
# Apply patches on import
|
|
12
|
+
apply_all_patches()
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"apply_all_patches",
|
|
16
|
+
"apply_default_warning_filters",
|
|
17
|
+
"suppress_fastmcp_logging",
|
|
18
|
+
"suppress_mcp_use_import_warnings",
|
|
19
|
+
]
|
hud/patches/mcp_patches.py
ADDED
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Runtime patches for the standard mcp package.
|
|
3
|
+
|
|
4
|
+
These patches apply fixes from the HUD fork without requiring a separate package.
|
|
5
|
+
Import this module early (e.g., in hud/__init__.py) to apply patches.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def patch_streamable_http_error_handling() -> None:
    """
    Patch StreamableHTTPTransport.post_writer to handle request errors properly.

    The original implementation doesn't catch errors in handle_request_async,
    which can cause the client to hang indefinitely. This patch wraps the handler
    to send a proper JSONRPCError response when transport errors occur (e.g.,
    ReadTimeout), allowing the waiting caller to receive the error and fail
    gracefully instead of hanging.

    Transport-level failures (connect/read errors, timeouts, SSL errors) are
    retried every 2 seconds until a deadline of
    ``max(settings.client_timeout, 30.0)`` seconds has passed; any other
    exception is reported to the caller immediately.

    If the mcp package cannot be imported the patch is skipped (debug log);
    any other failure while installing the patch is logged as a warning.
    """
    try:
        from mcp.client.streamable_http import StreamableHTTPTransport

        async def patched_post_writer(
            self: Any,
            client: Any,
            write_stream_reader: Any,
            read_stream_writer: Any,
            write_stream: Any,
            start_get_stream: Any,
            tg: Any,
        ) -> None:
            import asyncio
            import ssl
            import time

            import httpx
            from mcp.client.streamable_http import RequestContext
            from mcp.shared.message import ClientMessageMetadata, SessionMessage
            from mcp.types import ErrorData, JSONRPCError, JSONRPCMessage, JSONRPCRequest

            from hud.settings import settings

            # Floor for the retry window and pause between attempts (seconds).
            min_timeout = 30.0
            retry_delay = 2.0

            async def handle_request_async(ctx: RequestContext, is_resumption: bool) -> None:
                msg = ctx.session_message.message
                # Use configured timeout, minimum 30s to prevent instant failures.
                # (The floor was previously 15.0, contradicting this documented minimum.)
                timeout = max(settings.client_timeout, min_timeout)
                deadline = time.monotonic() + timeout
                retryable = (
                    httpx.ConnectError,
                    httpx.ReadError,
                    httpx.TimeoutException,
                    ssl.SSLError,
                )

                async def send_error_response(exc: Exception) -> None:
                    """Send an error response to the client."""
                    if isinstance(msg.root, JSONRPCRequest):
                        # Requests get a JSON-RPC error carrying the same id, so
                        # the caller waiting on that id is released.
                        error_response = JSONRPCError(
                            jsonrpc="2.0",
                            id=msg.root.id,
                            error=ErrorData(
                                code=-32000,
                                message=f"Transport error: {type(exc).__name__}",
                                data={"error_type": type(exc).__name__, "detail": str(exc)},
                            ),
                        )
                        await ctx.read_stream_writer.send(
                            SessionMessage(JSONRPCMessage(error_response))
                        )
                    else:
                        # Non-request messages have no id to answer; forward
                        # the exception object itself on the read stream.
                        await ctx.read_stream_writer.send(exc)

                while True:
                    try:
                        if is_resumption:
                            await self._handle_resumption_request(ctx)
                        else:
                            await self._handle_post_request(ctx)
                        return
                    except retryable as e:
                        # Transport failure: retry until the deadline passes.
                        if time.monotonic() >= deadline:
                            logger.error("MCP request failed after timeout: %s", e)
                            await send_error_response(e)
                            return
                        logger.warning("Retrying MCP request after error: %s", e)
                        await asyncio.sleep(retry_delay)
                    except asyncio.CancelledError:
                        # Never swallow cancellation.
                        raise
                    except Exception as e:
                        logger.exception("Request handler error: %s", e)
                        await send_error_response(e)
                        return

            try:
                async with write_stream_reader:
                    async for session_message in write_stream_reader:
                        message = session_message.message
                        metadata = (
                            session_message.metadata
                            if isinstance(session_message.metadata, ClientMessageMetadata)
                            else None
                        )
                        is_resumption = bool(metadata and metadata.resumption_token)

                        logger.debug("Sending client message: %s", message)

                        if self._is_initialized_notification(message):
                            start_get_stream()

                        ctx = RequestContext(
                            client=client,
                            headers=self.request_headers,
                            session_id=self.session_id,
                            session_message=session_message,
                            metadata=metadata,
                            read_stream_writer=read_stream_writer,
                            sse_read_timeout=self.sse_read_timeout,
                        )

                        # Requests run as tasks in the task group; everything
                        # else is awaited inline.
                        if isinstance(message.root, JSONRPCRequest):
                            tg.start_soon(handle_request_async, ctx, is_resumption)
                        else:
                            await handle_request_async(ctx, is_resumption)

            except Exception:
                logger.exception("Error in post_writer")
            finally:
                await read_stream_writer.aclose()
                await write_stream.aclose()

        StreamableHTTPTransport.post_writer = patched_post_writer
        logger.debug("Patched StreamableHTTPTransport.post_writer")

    except ImportError:
        logger.debug("mcp.client.streamable_http not available, skipping patch")
    except Exception as e:
        logger.warning("Failed to patch streamable_http: %s", e)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def patch_client_session_validation() -> None:
    """
    Replace ClientSession._validate_tool_result with a no-op coroutine.

    The stock validation is strict and raises for responses that do not
    conform but are still usable, so structured output validation is
    skipped entirely. A missing mcp package is logged at debug level; any
    other failure while patching is logged as a warning.
    """

    async def noop_validate(self: Any, name: str, result: Any) -> None:
        """Skip structured output validation entirely."""

    try:
        from mcp.client.session import ClientSession

        ClientSession._validate_tool_result = noop_validate
        logger.debug("Patched ClientSession._validate_tool_result to skip validation")
    except ImportError:
        logger.debug("mcp.client.session not available, skipping patch")
    except Exception as exc:
        logger.warning("Failed to patch client session: %s", exc)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def patch_server_output_validation() -> None:
    """
    Replace ``Server.call_tool`` with a variant that never validates tool
    output and that fills in ``structuredContent`` for tools whose output
    schema is flagged with ``x-fastmcp-wrap-result``.

    A missing mcp package is logged at debug level; any other failure while
    installing the patch is logged as a warning.
    """
    try:
        import json

        import mcp.types as types
        from mcp.server.lowlevel.server import Server

        def patched_call_tool(
            self: Any, validate_input: bool = True, validate_output: bool = False
        ) -> Any:
            """Patched call_tool that skips output validation."""

            def decorator(func: Any) -> Any:
                def as_text(value: str) -> list[Any]:
                    # Wrap a plain string as a single text content item.
                    return [types.TextContent(type="text", text=value)]

                async def handler(req: types.CallToolRequest) -> Any:
                    try:
                        name = req.params.name
                        call_args = req.params.arguments or {}
                        tool = await self._get_cached_tool_definition(name)

                        # Input is still checked against the tool's schema.
                        if validate_input and tool:
                            try:
                                import jsonschema

                                jsonschema.validate(instance=call_args, schema=tool.inputSchema)
                            except jsonschema.ValidationError as e:
                                return self._make_error_result(
                                    f"Input validation error: {e.message}"
                                )

                        results = await func(name, call_args)

                        # Normalise whatever the tool returned into content
                        # items plus optional structured content.
                        structured: dict[str, Any] | None = None
                        content: list[Any]
                        if isinstance(results, types.CallToolResult):
                            # Already a complete result: pass it through.
                            return types.ServerResult(results)
                        if isinstance(results, tuple) and len(results) == 2:
                            content, structured = results
                        elif isinstance(results, dict):
                            structured = results
                            content = as_text(json.dumps(results, indent=2))
                        elif results is None:
                            # None means success with no content.
                            content = []
                        elif isinstance(results, str):
                            # Strings are iterable but must not be split into chars.
                            content = as_text(results)
                        elif isinstance(results, (bytes, bytearray, memoryview)):
                            # Same for byte buffers: decode them as one text item.
                            content = as_text(bytes(results).decode("utf-8", errors="replace"))
                        elif isinstance(results, (int, float, bool)):
                            # Primitives become their string representation.
                            content = as_text(str(results))
                        elif hasattr(results, "__iter__"):
                            content = list(results)
                        else:
                            return self._make_error_result(
                                f"Unexpected return type: {type(results).__name__}"
                            )

                        # Auto-generate structuredContent for FastMCP tools:
                        # FastMCP generates outputSchema but doesn't populate it.
                        if structured is None and tool:
                            schema = getattr(tool, "outputSchema", None)
                            if schema and schema.get("x-fastmcp-wrap-result"):
                                # Only the first text item is considered.
                                first_text = next(
                                    (
                                        entry
                                        for entry in content
                                        if isinstance(entry, types.TextContent)
                                    ),
                                    None,
                                )
                                if first_text is not None:
                                    try:
                                        structured = {"result": json.loads(first_text.text)}
                                    except json.JSONDecodeError:
                                        # Not JSON: wrap the raw text instead.
                                        structured = {"result": first_text.text}

                        return types.ServerResult(
                            types.CallToolResult(
                                content=list(content),
                                structuredContent=structured,
                                isError=False,
                            )
                        )
                    except Exception as e:
                        # Any failure in the tool or in normalisation becomes
                        # an error result rather than propagating.
                        return self._make_error_result(str(e))

                self.request_handlers[types.CallToolRequest] = handler
                return func

            return decorator

        Server.call_tool = patched_call_tool
        logger.debug("Patched Server.call_tool to skip output validation")

    except ImportError:
        logger.debug("mcp.server.lowlevel.server not available, skipping patch")
    except Exception as e:
        logger.warning("Failed to patch server output validation: %s", e)
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def suppress_fastmcp_logging(level: int = logging.WARNING) -> None:
    """
    Raise the threshold of the fastmcp loggers to cut down on noise.

    FastMCP logs a lot of INFO-level messages that clutter output, so the
    fastmcp loggers named below are set to ``level``.

    Args:
        level: Logging level to set (default: WARNING)
    """
    for noisy_name in (
        "fastmcp",
        "fastmcp.server.server",
        "fastmcp.server.openapi",
        "fastmcp.tools.tool_manager",
    ):
        logging.getLogger(noisy_name).setLevel(level)

    logger.debug("Suppressed fastmcp logging to level %s", level)
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def apply_all_patches() -> None:
    """Run every MCP patch helper in a fixed order, then log completion."""
    # Order matches the original call sequence: HTTP error handling, client
    # session validation, server output validation, then logging suppression.
    patch_steps = (
        patch_streamable_http_error_handling,
        patch_client_session_validation,
        patch_server_output_validation,
        suppress_fastmcp_logging,
    )
    for apply_patch in patch_steps:
        apply_patch()

    logger.debug("All MCP patches applied")
|
hud/patches/warnings.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Centralized warning filters for noisy third-party dependencies.
|
|
3
|
+
|
|
4
|
+
Keep these helpers here so the rest of the codebase can stay clean and avoid
|
|
5
|
+
scattering warning filters across unrelated modules.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import warnings
|
|
11
|
+
from contextlib import contextmanager
|
|
12
|
+
from typing import TYPE_CHECKING
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from collections.abc import Iterator
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def apply_default_warning_filters(*, verbose: bool) -> None:
    """Install the default "ignore" warning filters for non-verbose CLI/server modes.

    When ``verbose`` is true nothing is changed. Otherwise an "ignore" filter
    is registered for ``DeprecationWarning`` and, if the class can be imported
    from ``pydantic.warnings``, another for ``PydanticDeprecatedSince20``.

    Args:
        verbose: Keyword-only flag; when true, no filters are installed.
    """
    if not verbose:
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        # Pydantic v2 emits PydanticDeprecatedSince20 for v1-style config usage in deps.
        try:
            from pydantic.warnings import PydanticDeprecatedSince20 as pydantic_deprecation
        except Exception:
            # The category class is unavailable, so there is nothing more to filter.
            return
        else:
            warnings.filterwarnings("ignore", category=pydantic_deprecation)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@contextmanager
def suppress_mcp_use_import_warnings() -> Iterator[None]:
    """Temporarily ignore known noisy warnings emitted during `mcp_use` imports.

    While the context is active, "ignore" filters scoped to `mcp_use` modules
    are installed inside ``warnings.catch_warnings()``, so the previous filter
    state is restored when the context exits.

    Yields:
        None, once the filters are in place.
    """
    try:
        from pydantic.warnings import PydanticDeprecatedSince20 as pydantic_deprecation
    except Exception:  # pragma: no cover
        pydantic_deprecation = None  # type: ignore[assignment]

    # (category, module regex) pairs, registered in this order.
    # mcp_use currently emits DeprecationWarning from its package __init__.py.
    filter_specs = [(DeprecationWarning, r"mcp_use(\..*)?$")]
    if pydantic_deprecation is not None:
        # mcp_use currently defines Pydantic v1-style `class Config` in oauth models.
        filter_specs.append((pydantic_deprecation, r"mcp_use\.client\.auth\.oauth$"))

    with warnings.catch_warnings():
        for category, module_pattern in filter_specs:
            warnings.filterwarnings("ignore", category=category, module=module_pattern)
        yield
|
hud/samples/browser.py
CHANGED
|
@@ -7,17 +7,17 @@ from typing import Any
|
|
|
7
7
|
from pydantic import Field
|
|
8
8
|
|
|
9
9
|
from hud.settings import settings
|
|
10
|
-
from hud.types import
|
|
10
|
+
from hud.types import LegacyTask, MCPToolCall
|
|
11
11
|
|
|
12
12
|
|
|
13
|
-
class BrowserTask(
|
|
14
|
-
"""
|
|
13
|
+
class BrowserTask(LegacyTask):
|
|
14
|
+
"""LegacyTask subclass with browser defaults for BrowserTask(prompt=...)."""
|
|
15
15
|
|
|
16
16
|
prompt: str = "Open Google and be ready to search."
|
|
17
17
|
mcp_config: dict[str, Any] = Field(
|
|
18
18
|
default_factory=lambda: {
|
|
19
19
|
"browser": {
|
|
20
|
-
"url":
|
|
20
|
+
"url": settings.hud_mcp_url,
|
|
21
21
|
"headers": {
|
|
22
22
|
"Authorization": f"Bearer {settings.api_key}",
|
|
23
23
|
"Mcp-Image": "hudevals/hud-remote-browser:0.1.1",
|
hud/server/__init__.py
CHANGED
hud/server/low_level.py
CHANGED
|
@@ -89,11 +89,12 @@ class LowLevelServerWithInit(_BaseLL):
|
|
|
89
89
|
|
|
90
90
|
def __init__(
|
|
91
91
|
self,
|
|
92
|
+
fastmcp: Any,
|
|
92
93
|
*args: Any,
|
|
93
94
|
init_fn: Callable[[RequestContext], Awaitable[None]] | None = None,
|
|
94
95
|
**kwargs: Any,
|
|
95
96
|
) -> None:
|
|
96
|
-
super().__init__(*args, **kwargs)
|
|
97
|
+
super().__init__(fastmcp, *args, **kwargs)
|
|
97
98
|
self._init_fn = init_fn
|
|
98
99
|
|
|
99
100
|
async def run(
|