hud-python 0.4.23__tar.gz → 0.4.24__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- {hud_python-0.4.23 → hud_python-0.4.24}/PKG-INFO +1 -1
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/base.py +48 -20
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/claude.py +5 -1
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/clients/mcp_use.py +19 -5
- hud_python-0.4.24/hud/clients/utils/__init__.py +26 -0
- hud_python-0.4.24/hud/clients/utils/retry.py +186 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/datasets/execution/parallel.py +71 -46
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/tests/test_version.py +1 -1
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/version.py +1 -1
- {hud_python-0.4.23 → hud_python-0.4.24}/pyproject.toml +1 -1
- hud_python-0.4.23/hud/clients/utils/__init__.py +0 -1
- {hud_python-0.4.23 → hud_python-0.4.24}/.gitignore +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/LICENSE +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/README.md +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/environments/README.md +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/environments/browser/README.md +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/environments/browser/apps/2048/README.md +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/environments/browser/apps/2048/backend/pyproject.toml +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/environments/browser/apps/README.md +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/environments/browser/apps/todo/README.md +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/environments/browser/apps/todo/backend/pyproject.toml +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/environments/browser/pyproject.toml +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/environments/remote_browser/README.md +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/environments/remote_browser/pyproject.toml +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/environments/remote_browser/src/hud_controller/providers/README.md +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/environments/text_2048/README.md +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/environments/text_2048/pyproject.toml +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/examples/README.md +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/__main__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/grounded_openai.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/langchain.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/misc/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/misc/response_agent.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/openai.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/openai_chat_generic.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/tests/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/tests/test_base.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/tests/test_claude.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/tests/test_client.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/agents/tests/test_openai.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/__main__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/analyze.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/build.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/clone.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/debug.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/dev.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/eval.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/hf.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/init.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/list_func.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/pull.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/push.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/remove.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/rl/README.md +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/rl/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/rl/init.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/rl/pod.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/rl/ssh.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/rl/train.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/rl/utils.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/test_analyze.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/test_analyze_metadata.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/test_build.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/test_cli_init.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/test_cli_main.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/test_clone.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/test_cursor.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/test_debug.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/test_list_func.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/test_main_module.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/test_mcp_server.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/test_pull.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/test_push.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/test_registry.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/tests/test_utils.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/utils/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/utils/cursor.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/utils/docker.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/utils/environment.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/utils/interactive.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/utils/logging.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/utils/metadata.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/utils/registry.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/utils/remote_runner.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/utils/runner.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/cli/utils/server.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/clients/README.md +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/clients/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/clients/base.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/clients/fastmcp.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/clients/tests/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/clients/tests/test_client_integration.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/clients/tests/test_fastmcp.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/clients/tests/test_protocol.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/clients/utils/retry_transport.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/datasets/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/datasets/execution/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/datasets/execution/runner.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/datasets/task.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/datasets/utils.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/misc/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/misc/claude_plays_pokemon.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/native/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/native/comparator.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/native/tests/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/native/tests/test_comparator.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/native/tests/test_native_init.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/otel/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/otel/collector.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/otel/config.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/otel/context.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/otel/exporters.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/otel/instrumentation.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/otel/processors.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/otel/tests/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/otel/tests/test_processors.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/py.typed +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/server/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/server/context.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/server/helper/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/server/low_level.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/server/server.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/server/tests/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/settings.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/shared/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/shared/exceptions.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/shared/hints.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/shared/requests.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/shared/tests/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/shared/tests/test_exceptions.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/shared/tests/test_requests.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/telemetry/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/telemetry/instrument.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/telemetry/job.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/telemetry/replay.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/telemetry/tests/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/telemetry/tests/test_replay.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/telemetry/tests/test_trace.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/telemetry/trace.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/base.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/bash.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/computer/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/computer/anthropic.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/computer/hud.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/computer/openai.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/computer/settings.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/edit.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/executors/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/executors/base.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/executors/pyautogui.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/executors/tests/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/executors/tests/test_base_executor.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/executors/xdo.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/grounding/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/grounding/config.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/grounding/grounded_tool.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/grounding/grounder.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/grounding/tests/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/playwright.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/response.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/submit.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/tests/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/tests/test_base.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/tests/test_bash.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/tests/test_bash_extended.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/tests/test_computer.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/tests/test_computer_actions.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/tests/test_edit.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/tests/test_init.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/tests/test_playwright_tool.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/tests/test_response.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/tests/test_tools.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/tests/test_tools_init.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/tests/test_utils.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/types.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/tools/utils.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/types.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/agent_factories.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/async_utils.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/hud_console.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/mcp.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/pretty_errors.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/progress.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/telemetry.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/tests/__init__.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/tests/test_async_utils.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/tests/test_init.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/tests/test_mcp.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/tests/test_progress.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/hud/utils/tests/test_telemetry.py +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/rl/README.md +0 -0
- {hud_python-0.4.23 → hud_python-0.4.24}/rl/pyproject.toml +0 -0
|
@@ -207,6 +207,7 @@ class MCPAgent(ABC):
|
|
|
207
207
|
else:
|
|
208
208
|
raise TypeError(f"prompt_or_task must be str or Task, got {type(prompt_or_task)}")
|
|
209
209
|
except Exception as e:
|
|
210
|
+
# Always return a Trace object for any exception
|
|
210
211
|
if self._is_connection_error(e):
|
|
211
212
|
# Return error trace for connection failures
|
|
212
213
|
return Trace(
|
|
@@ -215,7 +216,15 @@ class MCPAgent(ABC):
|
|
|
215
216
|
content=self._get_connection_error_message(e),
|
|
216
217
|
isError=True,
|
|
217
218
|
)
|
|
218
|
-
|
|
219
|
+
else:
|
|
220
|
+
# Return error trace for any other exception
|
|
221
|
+
return Trace(
|
|
222
|
+
reward=0.0,
|
|
223
|
+
done=True,
|
|
224
|
+
content=f"Task failed with error: {e}",
|
|
225
|
+
isError=True,
|
|
226
|
+
info={"error": str(e)},
|
|
227
|
+
)
|
|
219
228
|
finally:
|
|
220
229
|
# Cleanup auto-created resources
|
|
221
230
|
await self._cleanup()
|
|
@@ -262,34 +271,53 @@ class MCPAgent(ABC):
|
|
|
262
271
|
prompt_result = Trace(reward=0.0, done=True, content=str(e), isError=True)
|
|
263
272
|
prompt_result.populate_from_context()
|
|
264
273
|
|
|
265
|
-
# Always evaluate if we have
|
|
266
|
-
if
|
|
274
|
+
# Always evaluate if we have evaluate tool, regardless of errors
|
|
275
|
+
if task.evaluate_tool is not None:
|
|
267
276
|
try:
|
|
268
277
|
self.console.progress_log(f"Evaluating tool phase: {task.evaluate_tool}")
|
|
269
278
|
results = await self.call_tools(task.evaluate_tool)
|
|
270
279
|
|
|
271
280
|
if any(result.isError for result in results):
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
281
|
+
self.console.warning_log(f"Evaluate tool returned error: {results}")
|
|
282
|
+
# Still extract what we can from the error response
|
|
283
|
+
if prompt_result is None:
|
|
284
|
+
prompt_result = Trace(
|
|
285
|
+
reward=0.0,
|
|
286
|
+
done=True,
|
|
287
|
+
content="Task failed before evaluation",
|
|
288
|
+
isError=True,
|
|
289
|
+
)
|
|
290
|
+
prompt_result.reward = 0.0 # Default to 0 on error
|
|
291
|
+
else:
|
|
292
|
+
# Extract reward and content from evaluation
|
|
293
|
+
if results:
|
|
294
|
+
reward = find_reward(results[0])
|
|
295
|
+
eval_content = find_content(results[0])
|
|
296
|
+
|
|
297
|
+
# Update the prompt result with evaluation reward
|
|
298
|
+
if prompt_result is None:
|
|
299
|
+
prompt_result = Trace(
|
|
300
|
+
reward=reward, done=True, content=eval_content or "", isError=False
|
|
301
|
+
)
|
|
287
302
|
else:
|
|
288
|
-
prompt_result.
|
|
303
|
+
prompt_result.reward = reward
|
|
304
|
+
|
|
305
|
+
# Update the prompt result with evaluation content (if available)
|
|
306
|
+
if eval_content:
|
|
307
|
+
# Prompt result may already have final response content,
|
|
308
|
+
# so we append to it
|
|
309
|
+
if prompt_result.content:
|
|
310
|
+
prompt_result.content += "\n\n" + eval_content
|
|
311
|
+
else:
|
|
312
|
+
prompt_result.content = eval_content
|
|
289
313
|
|
|
290
314
|
except Exception as e:
|
|
291
315
|
self.console.error_log(f"Evaluation phase failed: {e}")
|
|
292
|
-
#
|
|
316
|
+
# Ensure we have a result even if evaluation failed
|
|
317
|
+
if prompt_result is None:
|
|
318
|
+
prompt_result = Trace(
|
|
319
|
+
reward=0.0, done=True, content=f"Evaluation failed: {e}", isError=True
|
|
320
|
+
)
|
|
293
321
|
|
|
294
322
|
return (
|
|
295
323
|
prompt_result
|
|
@@ -196,7 +196,11 @@ class ClaudeAgent(MCPAgent):
|
|
|
196
196
|
response = await self.anthropic_client.beta.messages.create(**create_kwargs)
|
|
197
197
|
break
|
|
198
198
|
except BadRequestError as e:
|
|
199
|
-
if
|
|
199
|
+
if (
|
|
200
|
+
"prompt is too long" in str(e)
|
|
201
|
+
or "request_too_large" in str(e)
|
|
202
|
+
or e.status_code == 413
|
|
203
|
+
):
|
|
200
204
|
logger.warning("Prompt too long, truncating message history")
|
|
201
205
|
# Keep first message and last 20 messages
|
|
202
206
|
if len(current_messages) > 21:
|
|
@@ -15,6 +15,7 @@ from hud.types import MCPToolCall, MCPToolResult
|
|
|
15
15
|
from hud.version import __version__ as hud_version
|
|
16
16
|
|
|
17
17
|
from .base import BaseHUDClient
|
|
18
|
+
from .utils.retry import retry_with_backoff
|
|
18
19
|
|
|
19
20
|
logger = logging.getLogger(__name__)
|
|
20
21
|
|
|
@@ -127,8 +128,11 @@ class MCPUseHUDClient(BaseHUDClient):
|
|
|
127
128
|
logger.warning("Client session not initialized for %s", server_name)
|
|
128
129
|
continue
|
|
129
130
|
|
|
130
|
-
# List tools
|
|
131
|
-
tools_result = await
|
|
131
|
+
# List tools with retry logic for HTTP errors
|
|
132
|
+
tools_result = await retry_with_backoff(
|
|
133
|
+
session.connector.client_session.list_tools,
|
|
134
|
+
operation_name=f"list_tools_{server_name}",
|
|
135
|
+
)
|
|
132
136
|
|
|
133
137
|
logger.info(
|
|
134
138
|
"Discovered %d tools from '%s': %s",
|
|
@@ -202,9 +206,12 @@ class MCPUseHUDClient(BaseHUDClient):
|
|
|
202
206
|
if session.connector.client_session is None:
|
|
203
207
|
raise ValueError(f"Client session not initialized for {server_name}")
|
|
204
208
|
|
|
205
|
-
|
|
209
|
+
# Call tool with retry logic for HTTP errors (502, 503, 504)
|
|
210
|
+
result = await retry_with_backoff(
|
|
211
|
+
session.connector.client_session.call_tool,
|
|
206
212
|
name=original_tool.name, # Use original tool name, not prefixed
|
|
207
213
|
arguments=tool_call.arguments or {},
|
|
214
|
+
operation_name=f"call_tool_{original_tool.name}",
|
|
208
215
|
)
|
|
209
216
|
|
|
210
217
|
if self.verbose:
|
|
@@ -232,7 +239,10 @@ class MCPUseHUDClient(BaseHUDClient):
|
|
|
232
239
|
continue
|
|
233
240
|
# Prefer standard method name if available
|
|
234
241
|
if hasattr(session.connector.client_session, "list_resources"):
|
|
235
|
-
resources = await
|
|
242
|
+
resources = await retry_with_backoff(
|
|
243
|
+
session.connector.client_session.list_resources,
|
|
244
|
+
operation_name=f"list_resources_{server_name}",
|
|
245
|
+
)
|
|
236
246
|
else:
|
|
237
247
|
# If the client doesn't support resource listing, skip
|
|
238
248
|
continue
|
|
@@ -262,7 +272,11 @@ class MCPUseHUDClient(BaseHUDClient):
|
|
|
262
272
|
resource_uri = AnyUrl(uri) if isinstance(uri, str) else uri
|
|
263
273
|
# Prefer read_resource; fall back to list_resources if needed
|
|
264
274
|
if hasattr(session.connector.client_session, "read_resource"):
|
|
265
|
-
result = await
|
|
275
|
+
result = await retry_with_backoff(
|
|
276
|
+
session.connector.client_session.read_resource,
|
|
277
|
+
resource_uri,
|
|
278
|
+
operation_name=f"read_resource_{server_name}",
|
|
279
|
+
)
|
|
266
280
|
else:
|
|
267
281
|
# Fallback path for older clients: not supported in strict typing
|
|
268
282
|
raise AttributeError("read_resource not available")
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""HUD MCP client utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .retry import (
|
|
6
|
+
DEFAULT_BACKOFF_FACTOR,
|
|
7
|
+
DEFAULT_MAX_RETRIES,
|
|
8
|
+
DEFAULT_RETRY_DELAY,
|
|
9
|
+
DEFAULT_RETRY_STATUS_CODES,
|
|
10
|
+
is_retryable_error,
|
|
11
|
+
retry_with_backoff,
|
|
12
|
+
with_retry,
|
|
13
|
+
)
|
|
14
|
+
from .retry_transport import RetryTransport, create_retry_httpx_client
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"DEFAULT_BACKOFF_FACTOR",
|
|
18
|
+
"DEFAULT_MAX_RETRIES",
|
|
19
|
+
"DEFAULT_RETRY_DELAY",
|
|
20
|
+
"DEFAULT_RETRY_STATUS_CODES",
|
|
21
|
+
"RetryTransport",
|
|
22
|
+
"create_retry_httpx_client",
|
|
23
|
+
"is_retryable_error",
|
|
24
|
+
"retry_with_backoff",
|
|
25
|
+
"with_retry",
|
|
26
|
+
]
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""Shared retry utilities for MCP client operations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
from functools import wraps
|
|
8
|
+
from typing import TYPE_CHECKING, Any, TypeVar
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from collections.abc import Callable
|
|
12
|
+
|
|
13
|
+
from httpx import HTTPStatusError
|
|
14
|
+
from mcp.shared.exceptions import McpError
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
T = TypeVar("T")
|
|
19
|
+
|
|
20
|
+
# Default retry configuration matching requests.py
|
|
21
|
+
DEFAULT_MAX_RETRIES = 4
|
|
22
|
+
DEFAULT_RETRY_DELAY = 2.0
|
|
23
|
+
DEFAULT_RETRY_STATUS_CODES = {502, 503, 504}
|
|
24
|
+
DEFAULT_BACKOFF_FACTOR = 2.0
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def is_retryable_error(error: Exception, retry_status_codes: set[int]) -> bool:
|
|
28
|
+
"""
|
|
29
|
+
Check if an error is retryable based on status codes.
|
|
30
|
+
|
|
31
|
+
Args:
|
|
32
|
+
error: The exception to check
|
|
33
|
+
retry_status_codes: Set of HTTP status codes to retry on
|
|
34
|
+
|
|
35
|
+
Returns:
|
|
36
|
+
True if the error is retryable, False otherwise
|
|
37
|
+
"""
|
|
38
|
+
# Check for HTTP status errors with retryable status codes
|
|
39
|
+
if isinstance(error, HTTPStatusError):
|
|
40
|
+
return error.response.status_code in retry_status_codes
|
|
41
|
+
|
|
42
|
+
# Check for MCP errors that might wrap HTTP errors
|
|
43
|
+
if isinstance(error, McpError):
|
|
44
|
+
error_msg = str(error).lower()
|
|
45
|
+
# Check for common gateway error patterns in the message
|
|
46
|
+
for code in retry_status_codes:
|
|
47
|
+
if str(code) in error_msg:
|
|
48
|
+
return True
|
|
49
|
+
# Check for gateway error keywords
|
|
50
|
+
if any(
|
|
51
|
+
keyword in error_msg
|
|
52
|
+
for keyword in ["bad gateway", "service unavailable", "gateway timeout"]
|
|
53
|
+
):
|
|
54
|
+
return True
|
|
55
|
+
|
|
56
|
+
# Check for generic errors with status codes in the message
|
|
57
|
+
error_msg = str(error)
|
|
58
|
+
for code in retry_status_codes:
|
|
59
|
+
if f"{code}" in error_msg or f"status {code}" in error_msg.lower():
|
|
60
|
+
return True
|
|
61
|
+
|
|
62
|
+
return False
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
async def retry_with_backoff(
|
|
66
|
+
func: Callable[..., Any],
|
|
67
|
+
*args: Any,
|
|
68
|
+
max_retries: int = DEFAULT_MAX_RETRIES,
|
|
69
|
+
retry_delay: float = DEFAULT_RETRY_DELAY,
|
|
70
|
+
retry_status_codes: set[int] | None = None,
|
|
71
|
+
backoff_factor: float = DEFAULT_BACKOFF_FACTOR,
|
|
72
|
+
operation_name: str | None = None,
|
|
73
|
+
**kwargs: Any,
|
|
74
|
+
) -> Any:
|
|
75
|
+
"""
|
|
76
|
+
Execute an async function with retry logic and exponential backoff.
|
|
77
|
+
|
|
78
|
+
This matches the retry behavior in requests.py but can be applied
|
|
79
|
+
to any async function, particularly MCP client operations.
|
|
80
|
+
|
|
81
|
+
Args:
|
|
82
|
+
func: The async function to retry
|
|
83
|
+
*args: Positional arguments for the function
|
|
84
|
+
max_retries: Maximum number of retry attempts
|
|
85
|
+
retry_delay: Initial delay between retries in seconds
|
|
86
|
+
retry_status_codes: HTTP status codes to retry on
|
|
87
|
+
backoff_factor: Multiplier for exponential backoff
|
|
88
|
+
operation_name: Name of the operation for logging
|
|
89
|
+
**kwargs: Keyword arguments for the function
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
The result of the function call
|
|
93
|
+
|
|
94
|
+
Raises:
|
|
95
|
+
The last exception if all retries are exhausted
|
|
96
|
+
"""
|
|
97
|
+
if retry_status_codes is None:
|
|
98
|
+
retry_status_codes = DEFAULT_RETRY_STATUS_CODES
|
|
99
|
+
|
|
100
|
+
operation = operation_name or func.__name__
|
|
101
|
+
last_error = None
|
|
102
|
+
|
|
103
|
+
for attempt in range(max_retries + 1):
|
|
104
|
+
try:
|
|
105
|
+
result = await func(*args, **kwargs)
|
|
106
|
+
return result
|
|
107
|
+
except Exception as e:
|
|
108
|
+
last_error = e
|
|
109
|
+
|
|
110
|
+
# Check if this is a retryable error
|
|
111
|
+
if not is_retryable_error(e, retry_status_codes):
|
|
112
|
+
# Not retryable, raise immediately
|
|
113
|
+
raise
|
|
114
|
+
|
|
115
|
+
# Don't retry if we've exhausted attempts
|
|
116
|
+
if attempt >= max_retries:
|
|
117
|
+
logger.debug(
|
|
118
|
+
"Operation '%s' failed after %d retries: %s",
|
|
119
|
+
operation,
|
|
120
|
+
max_retries,
|
|
121
|
+
e,
|
|
122
|
+
)
|
|
123
|
+
raise
|
|
124
|
+
|
|
125
|
+
# Calculate backoff delay (exponential backoff)
|
|
126
|
+
delay = retry_delay * (backoff_factor**attempt)
|
|
127
|
+
|
|
128
|
+
logger.warning(
|
|
129
|
+
"Operation '%s' failed with retryable error, "
|
|
130
|
+
"retrying in %.2f seconds (attempt %d/%d): %s",
|
|
131
|
+
operation,
|
|
132
|
+
delay,
|
|
133
|
+
attempt + 1,
|
|
134
|
+
max_retries,
|
|
135
|
+
e,
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
await asyncio.sleep(delay)
|
|
139
|
+
|
|
140
|
+
# This should never be reached, but just in case
|
|
141
|
+
if last_error:
|
|
142
|
+
raise last_error
|
|
143
|
+
raise RuntimeError(f"Unexpected retry loop exit for operation '{operation}'")
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def with_retry(
|
|
147
|
+
max_retries: int = DEFAULT_MAX_RETRIES,
|
|
148
|
+
retry_delay: float = DEFAULT_RETRY_DELAY,
|
|
149
|
+
retry_status_codes: set[int] | None = None,
|
|
150
|
+
backoff_factor: float = DEFAULT_BACKOFF_FACTOR,
|
|
151
|
+
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
|
|
152
|
+
"""
|
|
153
|
+
Decorator to add retry logic to async methods.
|
|
154
|
+
|
|
155
|
+
Usage:
|
|
156
|
+
@with_retry(max_retries=3)
|
|
157
|
+
async def my_method(self, ...):
|
|
158
|
+
...
|
|
159
|
+
|
|
160
|
+
Args:
|
|
161
|
+
max_retries: Maximum number of retry attempts
|
|
162
|
+
retry_delay: Initial delay between retries
|
|
163
|
+
retry_status_codes: HTTP status codes to retry on
|
|
164
|
+
backoff_factor: Multiplier for exponential backoff
|
|
165
|
+
|
|
166
|
+
Returns:
|
|
167
|
+
Decorated function with retry logic
|
|
168
|
+
"""
|
|
169
|
+
|
|
170
|
+
def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
|
|
171
|
+
@wraps(func)
|
|
172
|
+
async def wrapper(*args: Any, **kwargs: Any) -> Any:
|
|
173
|
+
return await retry_with_backoff(
|
|
174
|
+
func,
|
|
175
|
+
*args,
|
|
176
|
+
max_retries=max_retries,
|
|
177
|
+
retry_delay=retry_delay,
|
|
178
|
+
retry_status_codes=retry_status_codes,
|
|
179
|
+
backoff_factor=backoff_factor,
|
|
180
|
+
operation_name=func.__name__,
|
|
181
|
+
**kwargs,
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
return wrapper
|
|
185
|
+
|
|
186
|
+
return decorator
|
|
@@ -114,36 +114,58 @@ def _process_worker(
|
|
|
114
114
|
task_name = task_dict.get("prompt") or f"Task {index}"
|
|
115
115
|
|
|
116
116
|
# Use the job_id to group all tasks under the same job
|
|
117
|
-
with hud.trace(
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
agent
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
117
|
+
with hud.trace(
|
|
118
|
+
task_name, job_id=job_id, task_id=task_dict.get("id")
|
|
119
|
+
):
|
|
120
|
+
try:
|
|
121
|
+
# Convert dict to Task
|
|
122
|
+
task = Task(**task_dict)
|
|
123
|
+
|
|
124
|
+
# Create agent instance
|
|
125
|
+
agent = agent_class(**(agent_config or {}))
|
|
126
|
+
|
|
127
|
+
if auto_respond:
|
|
128
|
+
agent.response_agent = ResponseAgent()
|
|
129
|
+
|
|
130
|
+
# Run the task - this should ALWAYS return a result, even on error
|
|
131
|
+
result = await agent.run(task, max_steps=max_steps)
|
|
132
|
+
|
|
133
|
+
# Extract and print evaluation score for visibility
|
|
134
|
+
reward = getattr(result, "reward", "N/A")
|
|
135
|
+
logger.info(
|
|
136
|
+
"[Worker %s] Task %s: ✓ Completed (reward: %s)",
|
|
137
|
+
worker_id,
|
|
138
|
+
index,
|
|
139
|
+
reward,
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
logger.info(
|
|
143
|
+
"[Worker %s] Completed task %s (reward: %s)",
|
|
144
|
+
worker_id,
|
|
145
|
+
index,
|
|
146
|
+
reward,
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
return (index, result)
|
|
150
|
+
except Exception as e:
|
|
151
|
+
# Even if there's an exception, ensure we have a proper result
|
|
152
|
+
logger.error(
|
|
153
|
+
"[Worker %s] Task %s failed during execution: %s",
|
|
154
|
+
worker_id,
|
|
155
|
+
index,
|
|
156
|
+
str(e)[:200],
|
|
157
|
+
)
|
|
158
|
+
# Create a proper Trace result for errors
|
|
159
|
+
from hud.types import Trace
|
|
160
|
+
|
|
161
|
+
error_result = Trace(
|
|
162
|
+
reward=0.0,
|
|
163
|
+
done=True,
|
|
164
|
+
content=f"Task execution failed: {e}",
|
|
165
|
+
isError=True,
|
|
166
|
+
info={"error": str(e), "traceback": traceback.format_exc()},
|
|
167
|
+
)
|
|
168
|
+
return (index, error_result)
|
|
147
169
|
|
|
148
170
|
except Exception as e:
|
|
149
171
|
error_msg = f"Worker {worker_id}: Task {index} failed: {e}"
|
|
@@ -190,22 +212,6 @@ def _process_worker(
|
|
|
190
212
|
try:
|
|
191
213
|
# Run the async batch processing
|
|
192
214
|
results = loop.run_until_complete(process_batch())
|
|
193
|
-
|
|
194
|
-
# CRITICAL: Ensure telemetry is fully sent before process exits
|
|
195
|
-
# Two things need to complete:
|
|
196
|
-
# 1. The trace context's __exit__ already called _update_task_status_sync (blocking)
|
|
197
|
-
# 2. But spans are buffered in BatchSpanProcessor and need explicit flush
|
|
198
|
-
|
|
199
|
-
from opentelemetry import trace as otel_trace
|
|
200
|
-
|
|
201
|
-
provider = otel_trace.get_tracer_provider()
|
|
202
|
-
if provider and hasattr(provider, "force_flush"):
|
|
203
|
-
# This forces BatchSpanProcessor to export all buffered spans NOW
|
|
204
|
-
# The method returns True if successful, False if timeout
|
|
205
|
-
success = provider.force_flush(timeout_millis=5000) # 5 second timeout # type: ignore
|
|
206
|
-
if not success:
|
|
207
|
-
logger.warning("Worker %s: Telemetry flush timed out", worker_id)
|
|
208
|
-
|
|
209
215
|
return results
|
|
210
216
|
except KeyboardInterrupt:
|
|
211
217
|
logger.info("Worker %s: Interrupted by user, stopping gracefully", worker_id)
|
|
@@ -230,6 +236,25 @@ def _process_worker(
|
|
|
230
236
|
logger.error("Worker %s batch processing failed: %s", worker_id, e)
|
|
231
237
|
return [(idx, {"error": str(e), "isError": True}) for idx, _ in task_batch]
|
|
232
238
|
finally:
|
|
239
|
+
# CRITICAL: Always ensure telemetry is fully sent before process exits
|
|
240
|
+
# This must happen in finally block to ensure it runs even on errors
|
|
241
|
+
try:
|
|
242
|
+
from opentelemetry import trace as otel_trace
|
|
243
|
+
|
|
244
|
+
provider = otel_trace.get_tracer_provider()
|
|
245
|
+
if provider and hasattr(provider, "force_flush"):
|
|
246
|
+
# This forces BatchSpanProcessor to export all buffered spans NOW
|
|
247
|
+
# The method returns True if successful, False if timeout
|
|
248
|
+
success = provider.force_flush(
|
|
249
|
+
timeout_millis=10000
|
|
250
|
+
) # 10 second timeout # type: ignore
|
|
251
|
+
if not success:
|
|
252
|
+
logger.warning("Worker %s: Telemetry flush timed out", worker_id)
|
|
253
|
+
else:
|
|
254
|
+
logger.debug("Worker %s: Telemetry flushed successfully", worker_id)
|
|
255
|
+
except Exception as flush_error:
|
|
256
|
+
logger.error("Worker %s: Failed to flush telemetry: %s", worker_id, flush_error)
|
|
257
|
+
|
|
233
258
|
# Clean up the event loop
|
|
234
259
|
try:
|
|
235
260
|
loop.close()
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
"""HUD MCP client utilities."""
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hud_python-0.4.23 → hud_python-0.4.24}/environments/browser/apps/2048/backend/pyproject.toml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{hud_python-0.4.23 → hud_python-0.4.24}/environments/browser/apps/todo/backend/pyproject.toml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|