PyPI - hud-python - Versions diffs - 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl - Mend

hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (282) hide show

hud/__init__.py +27 -7
hud/agents/__init__.py +70 -5
hud/agents/base.py +238 -500
hud/agents/claude.py +236 -247
hud/agents/gateway.py +42 -0
hud/agents/gemini.py +264 -0
hud/agents/gemini_cua.py +324 -0
hud/agents/grounded_openai.py +98 -100
hud/agents/misc/integration_test_agent.py +51 -20
hud/agents/misc/response_agent.py +48 -36
hud/agents/openai.py +282 -296
hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
hud/agents/operator.py +199 -0
hud/agents/resolver.py +70 -0
hud/agents/tests/conftest.py +133 -0
hud/agents/tests/test_base.py +300 -622
hud/agents/tests/test_base_runtime.py +233 -0
hud/agents/tests/test_claude.py +381 -214
hud/agents/tests/test_client.py +9 -10
hud/agents/tests/test_gemini.py +369 -0
hud/agents/tests/test_grounded_openai_agent.py +65 -50
hud/agents/tests/test_openai.py +377 -140
hud/agents/tests/test_operator.py +362 -0
hud/agents/tests/test_resolver.py +192 -0
hud/agents/tests/test_run_eval.py +179 -0
hud/agents/types.py +148 -0
hud/cli/__init__.py +493 -546
hud/cli/analyze.py +43 -5
hud/cli/build.py +699 -113
hud/cli/debug.py +8 -5
hud/cli/dev.py +889 -732
hud/cli/eval.py +793 -667
hud/cli/flows/dev.py +167 -0
hud/cli/flows/init.py +191 -0
hud/cli/flows/tasks.py +153 -56
hud/cli/flows/templates.py +151 -0
hud/cli/flows/tests/__init__.py +1 -0
hud/cli/flows/tests/test_dev.py +126 -0
hud/cli/init.py +60 -58
hud/cli/pull.py +1 -1
hud/cli/push.py +38 -13
hud/cli/rft.py +311 -0
hud/cli/rft_status.py +145 -0
hud/cli/tests/test_analyze.py +5 -5
hud/cli/tests/test_analyze_metadata.py +3 -2
hud/cli/tests/test_analyze_module.py +120 -0
hud/cli/tests/test_build.py +110 -8
hud/cli/tests/test_build_failure.py +41 -0
hud/cli/tests/test_build_module.py +50 -0
hud/cli/tests/test_cli_init.py +6 -1
hud/cli/tests/test_cli_more_wrappers.py +30 -0
hud/cli/tests/test_cli_root.py +140 -0
hud/cli/tests/test_convert.py +361 -0
hud/cli/tests/test_debug.py +12 -10
hud/cli/tests/test_dev.py +197 -0
hud/cli/tests/test_eval.py +251 -0
hud/cli/tests/test_eval_bedrock.py +51 -0
hud/cli/tests/test_init.py +124 -0
hud/cli/tests/test_main_module.py +11 -5
hud/cli/tests/test_mcp_server.py +12 -100
hud/cli/tests/test_push.py +1 -1
hud/cli/tests/test_push_happy.py +74 -0
hud/cli/tests/test_push_wrapper.py +23 -0
hud/cli/tests/test_registry.py +1 -1
hud/cli/tests/test_utils.py +1 -1
hud/cli/{rl → utils}/celebrate.py +14 -12
hud/cli/utils/config.py +18 -1
hud/cli/utils/docker.py +130 -4
hud/cli/utils/env_check.py +9 -9
hud/cli/utils/git.py +136 -0
hud/cli/utils/interactive.py +39 -5
hud/cli/utils/metadata.py +70 -1
hud/cli/utils/runner.py +1 -1
hud/cli/utils/server.py +2 -2
hud/cli/utils/source_hash.py +3 -3
hud/cli/utils/tasks.py +4 -1
hud/cli/utils/tests/__init__.py +0 -0
hud/cli/utils/tests/test_config.py +58 -0
hud/cli/utils/tests/test_docker.py +93 -0
hud/cli/utils/tests/test_docker_hints.py +71 -0
hud/cli/utils/tests/test_env_check.py +74 -0
hud/cli/utils/tests/test_environment.py +42 -0
hud/cli/utils/tests/test_git.py +142 -0
hud/cli/utils/tests/test_interactive_module.py +60 -0
hud/cli/utils/tests/test_local_runner.py +50 -0
hud/cli/utils/tests/test_logging_utils.py +23 -0
hud/cli/utils/tests/test_metadata.py +49 -0
hud/cli/utils/tests/test_package_runner.py +35 -0
hud/cli/utils/tests/test_registry_utils.py +49 -0
hud/cli/utils/tests/test_remote_runner.py +25 -0
hud/cli/utils/tests/test_runner_modules.py +52 -0
hud/cli/utils/tests/test_source_hash.py +36 -0
hud/cli/utils/tests/test_tasks.py +80 -0
hud/cli/utils/version_check.py +258 -0
hud/cli/{rl → utils}/viewer.py +2 -2
hud/clients/README.md +12 -11
hud/clients/__init__.py +4 -3
hud/clients/base.py +166 -26
hud/clients/environment.py +51 -0
hud/clients/fastmcp.py +13 -6
hud/clients/mcp_use.py +45 -15
hud/clients/tests/test_analyze_scenarios.py +206 -0
hud/clients/tests/test_protocol.py +9 -3
hud/datasets/__init__.py +23 -20
hud/datasets/loader.py +326 -0
hud/datasets/runner.py +198 -105
hud/datasets/tests/__init__.py +0 -0
hud/datasets/tests/test_loader.py +221 -0
hud/datasets/tests/test_utils.py +315 -0
hud/datasets/utils.py +270 -90
hud/environment/__init__.py +52 -0
hud/environment/connection.py +258 -0
hud/environment/connectors/__init__.py +33 -0
hud/environment/connectors/base.py +68 -0
hud/environment/connectors/local.py +177 -0
hud/environment/connectors/mcp_config.py +137 -0
hud/environment/connectors/openai.py +101 -0
hud/environment/connectors/remote.py +172 -0
hud/environment/environment.py +835 -0
hud/environment/integrations/__init__.py +45 -0
hud/environment/integrations/adk.py +67 -0
hud/environment/integrations/anthropic.py +196 -0
hud/environment/integrations/gemini.py +92 -0
hud/environment/integrations/langchain.py +82 -0
hud/environment/integrations/llamaindex.py +68 -0
hud/environment/integrations/openai.py +238 -0
hud/environment/mock.py +306 -0
hud/environment/router.py +263 -0
hud/environment/scenarios.py +620 -0
hud/environment/tests/__init__.py +1 -0
hud/environment/tests/test_connection.py +317 -0
hud/environment/tests/test_connectors.py +205 -0
hud/environment/tests/test_environment.py +593 -0
hud/environment/tests/test_integrations.py +257 -0
hud/environment/tests/test_local_connectors.py +242 -0
hud/environment/tests/test_scenarios.py +1086 -0
hud/environment/tests/test_tools.py +208 -0
hud/environment/types.py +23 -0
hud/environment/utils/__init__.py +35 -0
hud/environment/utils/formats.py +215 -0
hud/environment/utils/schema.py +171 -0
hud/environment/utils/tool_wrappers.py +113 -0
hud/eval/__init__.py +67 -0
hud/eval/context.py +727 -0
hud/eval/display.py +299 -0
hud/eval/instrument.py +187 -0
hud/eval/manager.py +533 -0
hud/eval/parallel.py +268 -0
hud/eval/task.py +372 -0
hud/eval/tests/__init__.py +1 -0
hud/eval/tests/test_context.py +178 -0
hud/eval/tests/test_eval.py +210 -0
hud/eval/tests/test_manager.py +152 -0
hud/eval/tests/test_parallel.py +168 -0
hud/eval/tests/test_task.py +291 -0
hud/eval/types.py +65 -0
hud/eval/utils.py +194 -0
hud/patches/__init__.py +19 -0
hud/patches/mcp_patches.py +308 -0
hud/patches/warnings.py +54 -0
hud/samples/browser.py +4 -4
hud/server/__init__.py +2 -1
hud/server/low_level.py +2 -1
hud/server/router.py +164 -0
hud/server/server.py +567 -80
hud/server/tests/test_mcp_server_integration.py +11 -11
hud/server/tests/test_mcp_server_more.py +1 -1
hud/server/tests/test_server_extra.py +2 -0
hud/settings.py +45 -3
hud/shared/exceptions.py +36 -10
hud/shared/hints.py +26 -1
hud/shared/requests.py +15 -3
hud/shared/tests/test_exceptions.py +40 -31
hud/shared/tests/test_hints.py +167 -0
hud/telemetry/__init__.py +20 -19
hud/telemetry/exporter.py +201 -0
hud/telemetry/instrument.py +165 -253
hud/telemetry/tests/test_eval_telemetry.py +356 -0
hud/telemetry/tests/test_exporter.py +258 -0
hud/telemetry/tests/test_instrument.py +401 -0
hud/tools/__init__.py +18 -2
hud/tools/agent.py +223 -0
hud/tools/apply_patch.py +639 -0
hud/tools/base.py +54 -4
hud/tools/bash.py +2 -2
hud/tools/computer/__init__.py +36 -3
hud/tools/computer/anthropic.py +2 -2
hud/tools/computer/gemini.py +385 -0
hud/tools/computer/hud.py +23 -6
hud/tools/computer/openai.py +20 -21
hud/tools/computer/qwen.py +434 -0
hud/tools/computer/settings.py +37 -0
hud/tools/edit.py +3 -7
hud/tools/executors/base.py +4 -2
hud/tools/executors/pyautogui.py +1 -1
hud/tools/grounding/grounded_tool.py +13 -18
hud/tools/grounding/grounder.py +10 -31
hud/tools/grounding/tests/test_grounded_tool.py +26 -44
hud/tools/jupyter.py +330 -0
hud/tools/playwright.py +18 -3
hud/tools/shell.py +308 -0
hud/tools/tests/test_agent_tool.py +355 -0
hud/tools/tests/test_apply_patch.py +718 -0
hud/tools/tests/test_computer.py +4 -9
hud/tools/tests/test_computer_actions.py +24 -2
hud/tools/tests/test_jupyter_tool.py +181 -0
hud/tools/tests/test_shell.py +596 -0
hud/tools/tests/test_submit.py +85 -0
hud/tools/tests/test_types.py +193 -0
hud/tools/types.py +21 -1
hud/types.py +194 -56
hud/utils/__init__.py +2 -0
hud/utils/env.py +67 -0
hud/utils/hud_console.py +89 -18
hud/utils/mcp.py +15 -58
hud/utils/strict_schema.py +162 -0
hud/utils/tests/test_init.py +1 -2
hud/utils/tests/test_mcp.py +1 -28
hud/utils/tests/test_pretty_errors.py +186 -0
hud/utils/tests/test_tool_shorthand.py +154 -0
hud/utils/tests/test_version.py +1 -1
hud/utils/types.py +20 -0
hud/version.py +1 -1
hud_python-0.5.13.dist-info/METADATA +264 -0
hud_python-0.5.13.dist-info/RECORD +305 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
hud/agents/langchain.py +0 -261
hud/agents/lite_llm.py +0 -72
hud/cli/rl/__init__.py +0 -180
hud/cli/rl/config.py +0 -101
hud/cli/rl/display.py +0 -133
hud/cli/rl/gpu.py +0 -63
hud/cli/rl/gpu_utils.py +0 -321
hud/cli/rl/local_runner.py +0 -595
hud/cli/rl/presets.py +0 -96
hud/cli/rl/remote_runner.py +0 -463
hud/cli/rl/rl_api.py +0 -150
hud/cli/rl/vllm.py +0 -177
hud/cli/rl/wait_utils.py +0 -89
hud/datasets/parallel.py +0 -687
hud/misc/__init__.py +0 -1
hud/misc/claude_plays_pokemon.py +0 -292
hud/otel/__init__.py +0 -35
hud/otel/collector.py +0 -142
hud/otel/config.py +0 -181
hud/otel/context.py +0 -570
hud/otel/exporters.py +0 -369
hud/otel/instrumentation.py +0 -135
hud/otel/processors.py +0 -121
hud/otel/tests/__init__.py +0 -1
hud/otel/tests/test_processors.py +0 -197
hud/rl/README.md +0 -30
hud/rl/__init__.py +0 -1
hud/rl/actor.py +0 -176
hud/rl/buffer.py +0 -405
hud/rl/chat_template.jinja +0 -101
hud/rl/config.py +0 -192
hud/rl/distributed.py +0 -132
hud/rl/learner.py +0 -637
hud/rl/tests/__init__.py +0 -1
hud/rl/tests/test_learner.py +0 -186
hud/rl/train.py +0 -382
hud/rl/types.py +0 -101
hud/rl/utils/start_vllm_server.sh +0 -30
hud/rl/utils.py +0 -524
hud/rl/vllm_adapter.py +0 -143
hud/telemetry/job.py +0 -352
hud/telemetry/replay.py +0 -74
hud/telemetry/tests/test_replay.py +0 -40
hud/telemetry/tests/test_trace.py +0 -63
hud/telemetry/trace.py +0 -158
hud/utils/agent_factories.py +0 -86
hud/utils/async_utils.py +0 -65
hud/utils/group_eval.py +0 -223
hud/utils/progress.py +0 -149
hud/utils/tasks.py +0 -127
hud/utils/tests/test_async_utils.py +0 -173
hud/utils/tests/test_progress.py +0 -261
hud_python-0.4.45.dist-info/METADATA +0 -552
hud_python-0.4.45.dist-info/RECORD +0 -228
{hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
{hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0

hud/agents/tests/test_base_runtime.py ADDED Viewed

@@ -0,0 +1,233 @@
+"""Runtime tests for MCPAgent base class."""
+from __future__ import annotations
+from typing import Any
+import mcp.types as types
+import pytest
+from hud.agents.base import BaseCreateParams, MCPAgent, find_content, find_reward, text_to_blocks
+from hud.environment.router import ToolRouter
+from hud.eval.context import EvalContext
+from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult
+# Agent configuration for the dummy agent in this test module: a
+# BaseAgentConfig subclass declaring two string fields with fixed defaults.
+class DummyConfig(BaseAgentConfig):
+    # Name default used for the dummy agent.
+    model_name: str = "DummyAgent"
+    # Model identifier default; a placeholder string.
+    model: str = "dummy-model"
+# Creation parameters for the dummy agent: inherits from both BaseCreateParams
+# and DummyConfig and adds no members of its own.
+class DummyCreateParams(BaseCreateParams, DummyConfig):
+    pass
+class MockEvalContext(EvalContext):
+    """Stand-in EvalContext whose constructor assigns every attribute directly.
+    ``EvalContext.__init__`` is never invoked. Tool calls are answered by an
+    optional handler installed through ``set_call_tool_handler``; when no
+    handler is set, a fixed non-error "ok" text result is returned.
+    """
+    def __init__(
+        self,
+        prompt: str = "Test prompt",
+        tools: list[types.Tool] | None = None,
+    ) -> None:
+        """Store ``prompt`` and ``tools`` and fill the remaining attributes with defaults."""
+        # --- state the mock itself works with ---
+        self.prompt = prompt
+        self._tools = tools if tools else []
+        self._submitted: str | None = None
+        self.reward: float | None = None
+        self._call_tool_handler: Any = None
+        # --- environment-level attributes ---
+        self._router = ToolRouter()
+        self._agent_include: list[str] | None = None
+        self._agent_exclude: list[str] | None = None
+        # --- eval-context-level attributes ---
+        self._task = None
+        self.trace_id = "test-trace-id"
+        self.eval_name = "test-eval"
+        self.job_id: str | None = None
+        self.group_id: str | None = None
+        self.index = 0
+        self.variants: dict[str, Any] = {}
+        self.answer: str | None = None
+        self.system_prompt: str | None = None
+        self.error: BaseException | None = None
+        self.metadata: dict[str, Any] = {}
+        self.results: list[Any] = []
+        self._is_summary = False
+    def as_tools(self) -> list[types.Tool]:
+        """Return the tool list given at construction time."""
+        return self._tools
+    @property
+    def has_scenario(self) -> bool:
+        """Always ``False`` for this mock."""
+        return False
+    def set_call_tool_handler(self, handler: Any) -> None:
+        """Install the callable that ``call_tool`` delegates to."""
+        self._call_tool_handler = handler
+    async def list_tools(self) -> list[types.Tool]:
+        """Return the tool list given at construction time."""
+        return self._tools
+    async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
+        """Delegate to the installed handler, or return a fixed "ok" result.
+        ``call`` may be a ``(name[, arguments])`` tuple, an object exposing a
+        ``name`` attribute (passed through unchanged), or anything else, in
+        which case ``str(call)`` is the tool name and ``kwargs`` the arguments.
+        """
+        handler = self._call_tool_handler
+        if not handler:
+            # No handler installed: answer with a canned success result.
+            ok_block = types.TextContent(type="text", text="ok")
+            return MCPToolResult(content=[ok_block], isError=False)
+        # Normalise the supported call shapes before handing over to the handler.
+        if isinstance(call, tuple):
+            arguments = call[1] if len(call) > 1 else {}
+            tool_call = MCPToolCall(name=call[0], arguments=arguments)
+        elif hasattr(call, "name"):
+            tool_call = call
+        else:
+            tool_call = MCPToolCall(name=str(call), arguments=kwargs)
+        return handler(tool_call)
+    async def submit(self, answer: str) -> None:
+        """Record the submitted answer on ``_submitted``."""
+        self._submitted = answer
+class DummyAgent(MCPAgent):
+    """MCPAgent subclass with canned answers for every hook defined here."""
+    config_cls = DummyConfig
+    def __init__(self, **kwargs: Any) -> None:
+        """Build DummyCreateParams from ``kwargs`` and pass them to the base class."""
+        super().__init__(DummyCreateParams(**kwargs))
+    async def get_system_messages(self) -> list[types.ContentBlock]:
+        """Return a single text block containing "sys"."""
+        system_block = types.TextContent(type="text", text="sys")
+        return [system_block]
+    async def get_response(self, messages: list[Any]) -> AgentResponse:
+        """Return a finished response with content "ok" and no tool calls."""
+        canned = AgentResponse(content="ok", tool_calls=[], done=True)
+        return canned
+    async def format_blocks(self, blocks: list[Any]) -> list[Any]:
+        """Return ``blocks`` unchanged."""
+        return blocks
+    async def format_tool_results(
+        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
+    ) -> list[Any]:
+        """Ignore the inputs and return a single text block containing "tools"."""
+        placeholder = types.TextContent(type="text", text="tools")
+        return [placeholder]
+def test_find_reward_and_content_extractors() -> None:
+    """find_reward / find_content read structured content and JSON text payloads."""
+    # Case 1: the reward is carried in structuredContent.
+    structured = MCPToolResult(
+        content=text_to_blocks("{}"),
+        isError=False,
+        structuredContent={"reward": 0.7},
+    )
+    assert find_reward(structured) == 0.7
+    # Case 2: score and content are encoded as JSON inside the text block.
+    json_payload = '{"score": 0.5, "content": "hi"}'
+    from_text = MCPToolResult(content=text_to_blocks(json_payload), isError=False)
+    assert find_reward(from_text) == 0.5
+    assert find_content(from_text) == "hi"
+def test_get_available_tools_before_run_raises() -> None:
+    """get_available_tools() on a freshly constructed agent raises RuntimeError."""
+    fresh_agent = DummyAgent()
+    with pytest.raises(RuntimeError):
+        fresh_agent.get_available_tools()
+@pytest.mark.asyncio
+async def test_format_message_invalid_type_raises() -> None:
+    """format_message() raises ValueError when handed a plain dict."""
+    dummy = DummyAgent()
+    unsupported_message = {"oops": 1}
+    with pytest.raises(ValueError):
+        await dummy.format_message(unsupported_message)  # type: ignore
+def test_text_to_blocks_shapes() -> None:
+    """text_to_blocks() yields a non-empty list whose first item is a TextContent."""
+    produced = text_to_blocks("x")
+    assert isinstance(produced, list)
+    assert produced
+    assert isinstance(produced[0], types.TextContent)
+@pytest.mark.asyncio
+async def test_run_with_eval_context() -> None:
+    """run() on a mock context with max_steps=1 finishes without an error flag."""
+    eval_ctx = MockEvalContext(prompt="hello")
+    dummy = DummyAgent()
+    outcome = await dummy.run(eval_ctx, max_steps=1)
+    assert outcome.done is True
+    assert outcome.isError is False
+@pytest.mark.asyncio
+async def test_run_requires_eval_context() -> None:
+    """run() raises TypeError when given a plain string instead of a context."""
+    dummy = DummyAgent()
+    not_a_context = "hello"
+    with pytest.raises(TypeError, match="must be EvalContext"):
+        await dummy.run(not_a_context)  # type: ignore
+@pytest.mark.asyncio
+async def test_run_requires_prompt() -> None:
+    """run() raises ValueError when the context's prompt is the empty string."""
+    promptless_ctx = MockEvalContext(prompt="")
+    dummy = DummyAgent()
+    with pytest.raises(ValueError, match="prompt is not set"):
+        await dummy.run(promptless_ctx)
+@pytest.mark.asyncio
+async def test_call_tools_error_paths() -> None:
+    """call_tools() marks a result as an error when the tool handler raises.
+    The handler succeeds on its first invocation and raises RuntimeError on
+    every later one; the two results must be non-error then error.
+    """
+    invocations = 0
+    success = MCPToolResult(content=text_to_blocks("ok"), isError=False)
+    def handler(tool_call: MCPToolCall) -> MCPToolResult:
+        nonlocal invocations
+        invocations += 1
+        if invocations != 1:
+            raise RuntimeError("boom")
+        return success
+    eval_ctx = MockEvalContext(prompt="test")
+    eval_ctx.set_call_tool_handler(handler)
+    dummy = DummyAgent()
+    # Attach the context by hand and initialise from it, without going through run().
+    dummy.ctx = eval_ctx
+    await dummy._initialize_from_ctx(eval_ctx)
+    first_call = MCPToolCall(name="a", arguments={})
+    second_call = MCPToolCall(name="b", arguments={})
+    outcomes = await dummy.call_tools([first_call, second_call])
+    assert outcomes[0].isError is False
+    assert outcomes[1].isError is True
+@pytest.mark.asyncio
+async def test_call_tools_timeout_raises() -> None:
+    """A TimeoutError raised by the tool handler propagates out of call_tools()."""
+    def handler(tool_call: MCPToolCall) -> MCPToolResult:
+        raise TimeoutError("timeout")
+    eval_ctx = MockEvalContext(prompt="test")
+    eval_ctx.set_call_tool_handler(handler)
+    dummy = DummyAgent()
+    # Attach the context by hand and initialise from it, without going through run().
+    dummy.ctx = eval_ctx
+    await dummy._initialize_from_ctx(eval_ctx)
+    single_call = MCPToolCall(name="x", arguments={})
+    with pytest.raises(TimeoutError):
+        await dummy.call_tools(single_call)
+@pytest.mark.asyncio
+async def test_get_available_tools_after_run() -> None:
+    """After run() over a context with one tool, the agent reports itself initialized.
+    Only the ``_initialized`` flag is asserted; get_available_tools() is not called here.
+    """
+    single_tool = types.Tool(name="test_tool", description="Test", inputSchema={})
+    eval_ctx = MockEvalContext(prompt="hello", tools=[single_tool])
+    dummy = DummyAgent()
+    # run() performs the agent's initialization.
+    await dummy.run(eval_ctx, max_steps=1)
+    # Once run() has cleaned up, ctx is cleared and the tools can no longer be
+    # read, although they were available while the run was in progress.
+    assert dummy._initialized is True

hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl