hud-python 0.5.0__tar.gz → 0.5.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (301) hide show
  1. {hud_python-0.5.0 → hud_python-0.5.1}/PKG-INFO +27 -14
  2. {hud_python-0.5.0 → hud_python-0.5.1}/README.md +26 -13
  3. {hud_python-0.5.0 → hud_python-0.5.1}/hud/__init__.py +1 -1
  4. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/base.py +26 -2
  5. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/misc/response_agent.py +5 -1
  6. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/openai_chat.py +12 -0
  7. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_base.py +64 -0
  8. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/eval.py +56 -27
  9. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/init.py +4 -0
  10. {hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/runner.py +4 -3
  11. {hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/utils.py +7 -0
  12. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/connectors/remote.py +3 -4
  13. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/environment.py +11 -3
  14. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/scenarios.py +46 -9
  15. {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/context.py +26 -23
  16. hud_python-0.5.1/hud/eval/instrument.py +185 -0
  17. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/hud_console.py +7 -3
  18. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tests/test_version.py +1 -1
  19. {hud_python-0.5.0 → hud_python-0.5.1}/hud/version.py +1 -1
  20. {hud_python-0.5.0 → hud_python-0.5.1}/pyproject.toml +1 -1
  21. hud_python-0.5.0/hud/eval/instrument.py +0 -115
  22. {hud_python-0.5.0 → hud_python-0.5.1}/.gitignore +0 -0
  23. {hud_python-0.5.0 → hud_python-0.5.1}/LICENSE +0 -0
  24. {hud_python-0.5.0 → hud_python-0.5.1}/examples/README.md +0 -0
  25. {hud_python-0.5.0 → hud_python-0.5.1}/hud/__main__.py +0 -0
  26. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/__init__.py +0 -0
  27. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/claude.py +0 -0
  28. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/gemini.py +0 -0
  29. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/gemini_cua.py +0 -0
  30. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/grounded_openai.py +0 -0
  31. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/misc/__init__.py +0 -0
  32. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/misc/integration_test_agent.py +0 -0
  33. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/openai.py +0 -0
  34. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/operator.py +0 -0
  35. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/__init__.py +0 -0
  36. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/conftest.py +0 -0
  37. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_base_runtime.py +0 -0
  38. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_claude.py +0 -0
  39. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_client.py +0 -0
  40. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_gemini.py +0 -0
  41. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
  42. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_openai.py +0 -0
  43. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_operator.py +0 -0
  44. {hud_python-0.5.0 → hud_python-0.5.1}/hud/agents/tests/test_run_eval.py +0 -0
  45. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/__init__.py +0 -0
  46. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/__main__.py +0 -0
  47. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/analyze.py +0 -0
  48. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/build.py +0 -0
  49. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/clone.py +0 -0
  50. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/debug.py +0 -0
  51. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/dev.py +0 -0
  52. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/flows/__init__.py +0 -0
  53. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/flows/dev.py +0 -0
  54. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/flows/init.py +0 -0
  55. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/flows/tasks.py +0 -0
  56. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/flows/templates.py +0 -0
  57. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/flows/tests/__init__.py +0 -0
  58. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/flows/tests/test_dev.py +0 -0
  59. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/get.py +0 -0
  60. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/list_func.py +0 -0
  61. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/pull.py +0 -0
  62. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/push.py +0 -0
  63. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/remove.py +0 -0
  64. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/rft.py +0 -0
  65. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/rft_status.py +0 -0
  66. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/__init__.py +0 -0
  67. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_analyze.py +0 -0
  68. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_analyze_metadata.py +0 -0
  69. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_analyze_module.py +0 -0
  70. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_build.py +0 -0
  71. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_build_failure.py +0 -0
  72. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_build_module.py +0 -0
  73. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_cli_init.py +0 -0
  74. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_cli_main.py +0 -0
  75. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
  76. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_cli_root.py +0 -0
  77. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_clone.py +0 -0
  78. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_convert.py +0 -0
  79. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_cursor.py +0 -0
  80. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_debug.py +0 -0
  81. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_dev.py +0 -0
  82. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_eval.py +0 -0
  83. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_eval_bedrock.py +0 -0
  84. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_init.py +0 -0
  85. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_list_func.py +0 -0
  86. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_main_module.py +0 -0
  87. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_mcp_server.py +0 -0
  88. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_pull.py +0 -0
  89. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_push.py +0 -0
  90. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_push_happy.py +0 -0
  91. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_push_wrapper.py +0 -0
  92. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_registry.py +0 -0
  93. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/tests/test_utils.py +0 -0
  94. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/__init__.py +0 -0
  95. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/celebrate.py +0 -0
  96. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/config.py +0 -0
  97. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/cursor.py +0 -0
  98. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/docker.py +0 -0
  99. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/env_check.py +0 -0
  100. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/environment.py +0 -0
  101. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/git.py +0 -0
  102. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/interactive.py +0 -0
  103. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/local_runner.py +0 -0
  104. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/logging.py +0 -0
  105. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/metadata.py +0 -0
  106. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/package_runner.py +0 -0
  107. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/registry.py +0 -0
  108. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/remote_runner.py +0 -0
  109. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/runner.py +0 -0
  110. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/server.py +0 -0
  111. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/source_hash.py +0 -0
  112. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tasks.py +0 -0
  113. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/__init__.py +0 -0
  114. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_config.py +0 -0
  115. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_docker.py +0 -0
  116. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_docker_hints.py +0 -0
  117. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_env_check.py +0 -0
  118. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_environment.py +0 -0
  119. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_git.py +0 -0
  120. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_interactive_module.py +0 -0
  121. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_local_runner.py +0 -0
  122. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_logging_utils.py +0 -0
  123. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_metadata.py +0 -0
  124. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_package_runner.py +0 -0
  125. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_registry_utils.py +0 -0
  126. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_remote_runner.py +0 -0
  127. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_runner_modules.py +0 -0
  128. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_source_hash.py +0 -0
  129. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/tests/test_tasks.py +0 -0
  130. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/version_check.py +0 -0
  131. {hud_python-0.5.0 → hud_python-0.5.1}/hud/cli/utils/viewer.py +0 -0
  132. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/README.md +0 -0
  133. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/__init__.py +0 -0
  134. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/base.py +0 -0
  135. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/environment.py +0 -0
  136. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/fastmcp.py +0 -0
  137. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/mcp_use.py +0 -0
  138. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/tests/__init__.py +0 -0
  139. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/tests/test_analyze_scenarios.py +0 -0
  140. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/tests/test_client_integration.py +0 -0
  141. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/tests/test_fastmcp.py +0 -0
  142. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/tests/test_mcp_use_retry.py +0 -0
  143. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/tests/test_protocol.py +0 -0
  144. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/utils/__init__.py +0 -0
  145. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/utils/mcp_use_retry.py +0 -0
  146. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/utils/retry.py +0 -0
  147. {hud_python-0.5.0 → hud_python-0.5.1}/hud/clients/utils/retry_transport.py +0 -0
  148. {hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/__init__.py +0 -0
  149. {hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/loader.py +0 -0
  150. {hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/tests/__init__.py +0 -0
  151. {hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/tests/test_loader.py +0 -0
  152. {hud_python-0.5.0 → hud_python-0.5.1}/hud/datasets/tests/test_utils.py +0 -0
  153. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/__init__.py +0 -0
  154. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/connection.py +0 -0
  155. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/connectors/__init__.py +0 -0
  156. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/connectors/base.py +0 -0
  157. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/connectors/local.py +0 -0
  158. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/connectors/mcp_config.py +0 -0
  159. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/connectors/openai.py +0 -0
  160. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/integrations/__init__.py +0 -0
  161. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/integrations/adk.py +0 -0
  162. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/integrations/anthropic.py +0 -0
  163. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/integrations/gemini.py +0 -0
  164. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/integrations/langchain.py +0 -0
  165. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/integrations/llamaindex.py +0 -0
  166. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/integrations/openai.py +0 -0
  167. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/mock.py +0 -0
  168. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/router.py +0 -0
  169. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/__init__.py +0 -0
  170. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/test_connection.py +0 -0
  171. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/test_connectors.py +0 -0
  172. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/test_environment.py +0 -0
  173. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/test_integrations.py +0 -0
  174. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/test_local_connectors.py +0 -0
  175. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/test_scenarios.py +0 -0
  176. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/tests/test_tools.py +0 -0
  177. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/types.py +0 -0
  178. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/utils/__init__.py +0 -0
  179. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/utils/formats.py +0 -0
  180. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/utils/schema.py +0 -0
  181. {hud_python-0.5.0 → hud_python-0.5.1}/hud/environment/utils/tool_wrappers.py +0 -0
  182. {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/__init__.py +0 -0
  183. {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/display.py +0 -0
  184. {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/manager.py +0 -0
  185. {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/parallel.py +0 -0
  186. {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/task.py +0 -0
  187. {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/tests/__init__.py +0 -0
  188. {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/tests/test_context.py +0 -0
  189. {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/tests/test_eval.py +0 -0
  190. {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/tests/test_manager.py +0 -0
  191. {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/tests/test_parallel.py +0 -0
  192. {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/tests/test_task.py +0 -0
  193. {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/types.py +0 -0
  194. {hud_python-0.5.0 → hud_python-0.5.1}/hud/eval/utils.py +0 -0
  195. {hud_python-0.5.0 → hud_python-0.5.1}/hud/native/__init__.py +0 -0
  196. {hud_python-0.5.0 → hud_python-0.5.1}/hud/native/comparator.py +0 -0
  197. {hud_python-0.5.0 → hud_python-0.5.1}/hud/native/tests/__init__.py +0 -0
  198. {hud_python-0.5.0 → hud_python-0.5.1}/hud/native/tests/test_comparator.py +0 -0
  199. {hud_python-0.5.0 → hud_python-0.5.1}/hud/native/tests/test_native_init.py +0 -0
  200. {hud_python-0.5.0 → hud_python-0.5.1}/hud/patches/__init__.py +0 -0
  201. {hud_python-0.5.0 → hud_python-0.5.1}/hud/patches/mcp_patches.py +0 -0
  202. {hud_python-0.5.0 → hud_python-0.5.1}/hud/patches/warnings.py +0 -0
  203. {hud_python-0.5.0 → hud_python-0.5.1}/hud/py.typed +0 -0
  204. {hud_python-0.5.0 → hud_python-0.5.1}/hud/samples/__init__.py +0 -0
  205. {hud_python-0.5.0 → hud_python-0.5.1}/hud/samples/browser.py +0 -0
  206. {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/__init__.py +0 -0
  207. {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/context.py +0 -0
  208. {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/helper/__init__.py +0 -0
  209. {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/low_level.py +0 -0
  210. {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/router.py +0 -0
  211. {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/server.py +0 -0
  212. {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/__init__.py +0 -0
  213. {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_add_tool.py +0 -0
  214. {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_context.py +0 -0
  215. {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_mcp_server_handlers.py +0 -0
  216. {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_mcp_server_integration.py +0 -0
  217. {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_mcp_server_more.py +0 -0
  218. {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_run_wrapper.py +0 -0
  219. {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_server_extra.py +0 -0
  220. {hud_python-0.5.0 → hud_python-0.5.1}/hud/server/tests/test_sigterm_runner.py +0 -0
  221. {hud_python-0.5.0 → hud_python-0.5.1}/hud/settings.py +0 -0
  222. {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/__init__.py +0 -0
  223. {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/exceptions.py +0 -0
  224. {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/hints.py +0 -0
  225. {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/requests.py +0 -0
  226. {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/tests/__init__.py +0 -0
  227. {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/tests/test_exceptions.py +0 -0
  228. {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/tests/test_hints.py +0 -0
  229. {hud_python-0.5.0 → hud_python-0.5.1}/hud/shared/tests/test_requests.py +0 -0
  230. {hud_python-0.5.0 → hud_python-0.5.1}/hud/telemetry/__init__.py +0 -0
  231. {hud_python-0.5.0 → hud_python-0.5.1}/hud/telemetry/exporter.py +0 -0
  232. {hud_python-0.5.0 → hud_python-0.5.1}/hud/telemetry/instrument.py +0 -0
  233. {hud_python-0.5.0 → hud_python-0.5.1}/hud/telemetry/tests/__init__.py +0 -0
  234. {hud_python-0.5.0 → hud_python-0.5.1}/hud/telemetry/tests/test_eval_telemetry.py +0 -0
  235. {hud_python-0.5.0 → hud_python-0.5.1}/hud/telemetry/tests/test_exporter.py +0 -0
  236. {hud_python-0.5.0 → hud_python-0.5.1}/hud/telemetry/tests/test_instrument.py +0 -0
  237. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/__init__.py +0 -0
  238. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/apply_patch.py +0 -0
  239. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/base.py +0 -0
  240. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/bash.py +0 -0
  241. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/computer/__init__.py +0 -0
  242. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/computer/anthropic.py +0 -0
  243. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/computer/gemini.py +0 -0
  244. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/computer/hud.py +0 -0
  245. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/computer/openai.py +0 -0
  246. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/computer/qwen.py +0 -0
  247. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/computer/settings.py +0 -0
  248. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/edit.py +0 -0
  249. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/executors/__init__.py +0 -0
  250. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/executors/base.py +0 -0
  251. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/executors/pyautogui.py +0 -0
  252. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/executors/tests/__init__.py +0 -0
  253. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/executors/tests/test_base_executor.py +0 -0
  254. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  255. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/executors/xdo.py +0 -0
  256. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/grounding/__init__.py +0 -0
  257. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/grounding/config.py +0 -0
  258. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/grounding/grounded_tool.py +0 -0
  259. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/grounding/grounder.py +0 -0
  260. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/grounding/tests/__init__.py +0 -0
  261. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
  262. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/jupyter.py +0 -0
  263. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/playwright.py +0 -0
  264. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/response.py +0 -0
  265. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/shell.py +0 -0
  266. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/submit.py +0 -0
  267. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/__init__.py +0 -0
  268. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_apply_patch.py +0 -0
  269. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_base.py +0 -0
  270. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_bash.py +0 -0
  271. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_bash_extended.py +0 -0
  272. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_computer.py +0 -0
  273. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_computer_actions.py +0 -0
  274. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_edit.py +0 -0
  275. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_init.py +0 -0
  276. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_jupyter_tool.py +0 -0
  277. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_playwright_tool.py +0 -0
  278. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_response.py +0 -0
  279. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_shell.py +0 -0
  280. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_submit.py +0 -0
  281. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_tools.py +0 -0
  282. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_tools_init.py +0 -0
  283. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_types.py +0 -0
  284. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/tests/test_utils.py +0 -0
  285. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/types.py +0 -0
  286. {hud_python-0.5.0 → hud_python-0.5.1}/hud/tools/utils.py +0 -0
  287. {hud_python-0.5.0 → hud_python-0.5.1}/hud/types.py +0 -0
  288. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/__init__.py +0 -0
  289. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/env.py +0 -0
  290. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/mcp.py +0 -0
  291. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/pretty_errors.py +0 -0
  292. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/strict_schema.py +0 -0
  293. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/telemetry.py +0 -0
  294. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tests/__init__.py +0 -0
  295. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tests/test_init.py +0 -0
  296. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tests/test_mcp.py +0 -0
  297. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tests/test_pretty_errors.py +0 -0
  298. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tests/test_telemetry.py +0 -0
  299. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tests/test_tool_shorthand.py +0 -0
  300. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/tool_shorthand.py +0 -0
  301. {hud_python-0.5.0 → hud_python-0.5.1}/hud/utils/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.5.0
3
+ Version: 0.5.1
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -166,14 +166,21 @@ from hud import Environment
166
166
  env = Environment("my-env")
167
167
 
168
168
  @env.tool()
169
- def search(query: str) -> str:
170
- """Search the knowledge base."""
171
- return db.search(query)
172
-
173
- @env.scenario("find-answer")
174
- async def find_answer(question: str, answer: str):
175
- response = yield f"Find: {question}" # Prompt
176
- yield 1.0 if answer in response else 0.0 # Reward
169
+ def add(a: int, b: int) -> int:
170
+ """Add two numbers."""
171
+ return a + b
172
+
173
+ @env.scenario("solve-math")
174
+ async def solve_math(problem: str, answer: int):
175
+ response = yield problem # Prompt
176
+ yield 1.0 if str(answer) in response else 0.0 # Reward
177
+
178
+ async with env("solve-math", problem="What is 2+2?", answer=4) as ctx:
179
+ # Your agent logic here - call tools, get response
180
+ result = await ctx.call_tool("add", a=2, b=2)
181
+ await ctx.submit(f"The answer is {result}")
182
+
183
+ print(ctx.reward) # 1.0
177
184
  ```
178
185
 
179
186
  The agent runs between the yields. First yield sends the prompt, second yield scores the result. → [Docs](https://docs.hud.ai/quick-links/environments) · [Templates](https://hud.ai/environments)
@@ -183,14 +190,20 @@ The agent runs between the yields. First yield sends the prompt, second yield sc
183
190
  Test different models. Repeat runs to see the distribution:
184
191
 
185
192
  ```python
186
- import hud
193
+ from openai import AsyncOpenAI
194
+ import os
187
195
 
188
- task = env("find-answer", question="What is 2+2?", answer="4")
196
+ client = AsyncOpenAI(
197
+ base_url="https://inference.hud.ai",
198
+ api_key=os.environ["HUD_API_KEY"]
199
+ )
189
200
 
190
- async with hud.eval(task, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx:
201
+ # Using the env from above
202
+ async with env("solve-math", problem="What is 2+2?", answer=4, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx:
191
203
  response = await client.chat.completions.create(
192
204
  model=ctx.variants["model"],
193
- messages=[{"role": "user", "content": ctx.prompt}]
205
+ messages=[{"role": "user", "content": ctx.prompt}],
206
+ tools=ctx.tools # Environment tools available to the model
194
207
  )
195
208
  await ctx.submit(response.choices[0].message.content)
196
209
  ```
@@ -205,7 +218,7 @@ Push to GitHub, connect on hud.ai, run at scale:
205
218
  hud init # Scaffold environment
206
219
  git push # Push to GitHub
207
220
  # Connect on hud.ai → New → Environment
208
- hud eval my-org/my-eval --model gpt-4o --group-size 100
221
+ hud eval my-eval --model gpt-4o --group-size 100
209
222
  # Or create and run tasks on the platform
210
223
  ```
211
224
 
@@ -68,14 +68,21 @@ from hud import Environment
68
68
  env = Environment("my-env")
69
69
 
70
70
  @env.tool()
71
- def search(query: str) -> str:
72
- """Search the knowledge base."""
73
- return db.search(query)
74
-
75
- @env.scenario("find-answer")
76
- async def find_answer(question: str, answer: str):
77
- response = yield f"Find: {question}" # Prompt
78
- yield 1.0 if answer in response else 0.0 # Reward
71
+ def add(a: int, b: int) -> int:
72
+ """Add two numbers."""
73
+ return a + b
74
+
75
+ @env.scenario("solve-math")
76
+ async def solve_math(problem: str, answer: int):
77
+ response = yield problem # Prompt
78
+ yield 1.0 if str(answer) in response else 0.0 # Reward
79
+
80
+ async with env("solve-math", problem="What is 2+2?", answer=4) as ctx:
81
+ # Your agent logic here - call tools, get response
82
+ result = await ctx.call_tool("add", a=2, b=2)
83
+ await ctx.submit(f"The answer is {result}")
84
+
85
+ print(ctx.reward) # 1.0
79
86
  ```
80
87
 
81
88
  The agent runs between the yields. First yield sends the prompt, second yield scores the result. → [Docs](https://docs.hud.ai/quick-links/environments) · [Templates](https://hud.ai/environments)
@@ -85,14 +92,20 @@ The agent runs between the yields. First yield sends the prompt, second yield sc
85
92
  Test different models. Repeat runs to see the distribution:
86
93
 
87
94
  ```python
88
- import hud
95
+ from openai import AsyncOpenAI
96
+ import os
89
97
 
90
- task = env("find-answer", question="What is 2+2?", answer="4")
98
+ client = AsyncOpenAI(
99
+ base_url="https://inference.hud.ai",
100
+ api_key=os.environ["HUD_API_KEY"]
101
+ )
91
102
 
92
- async with hud.eval(task, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx:
103
+ # Using the env from above
104
+ async with env("solve-math", problem="What is 2+2?", answer=4, variants={"model": ["gpt-4o", "claude-sonnet-4-5"]}, group=5) as ctx:
93
105
  response = await client.chat.completions.create(
94
106
  model=ctx.variants["model"],
95
- messages=[{"role": "user", "content": ctx.prompt}]
107
+ messages=[{"role": "user", "content": ctx.prompt}],
108
+ tools=ctx.tools # Environment tools available to the model
96
109
  )
97
110
  await ctx.submit(response.choices[0].message.content)
98
111
  ```
@@ -107,7 +120,7 @@ Push to GitHub, connect on hud.ai, run at scale:
107
120
  hud init # Scaffold environment
108
121
  git push # Push to GitHub
109
122
  # Connect on hud.ai → New → Environment
110
- hud eval my-org/my-eval --model gpt-4o --group-size 100
123
+ hud eval my-eval --model gpt-4o --group-size 100
111
124
  # Or create and run tasks on the platform
112
125
  ```
113
126
 
@@ -18,7 +18,7 @@ from .telemetry.instrument import instrument
18
18
  def trace(*args: object, **kwargs: object) -> EvalContext:
19
19
  """Deprecated: Use hud.eval() instead.
20
20
 
21
- .. deprecated:: 0.5.0
21
+ .. deprecated:: 0.5.1
22
22
  hud.trace() is deprecated. Use hud.eval() or env.eval() instead.
23
23
  """
24
24
  warnings.warn(
@@ -182,7 +182,23 @@ class MCPAgent(ABC):
182
182
  raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
183
183
 
184
184
  if not ctx.prompt:
185
- raise ValueError("ctx.prompt is not set - did the scenario setup run?")
185
+ if ctx.has_scenario:
186
+ # Scenario was specified but prompt is still empty
187
+ # (e.g., scenario returned empty string, or edge case not caught in scenarios.py)
188
+ scenario = ctx._task.scenario if ctx._task else "unknown"
189
+ raise ValueError(
190
+ f"ctx.prompt is not set.\n\n"
191
+ f"Scenario '{scenario}' was specified but returned an empty prompt.\n"
192
+ f"Check that the scenario's setup function returns a non-empty string."
193
+ )
194
+ else:
195
+ # No scenario specified at all
196
+ raise ValueError(
197
+ "ctx.prompt is not set.\n\n"
198
+ "No scenario was specified in your task file.\n"
199
+ "Either add a 'scenario' field to your task, or set ctx.prompt manually "
200
+ "before running the agent."
201
+ )
186
202
 
187
203
  # Store context for tool calls
188
204
  self.ctx = ctx
@@ -194,6 +210,11 @@ class MCPAgent(ABC):
194
210
  try:
195
211
  result = await self._run_context(text_to_blocks(ctx.prompt), max_steps=max_steps)
196
212
 
213
+ # Propagate error state to context for platform visibility
214
+ if result.isError and hasattr(ctx, "error"):
215
+ error_msg = result.info.get("error") if result.info else result.content
216
+ ctx.error = Exception(str(error_msg)) if error_msg else Exception("Agent error")
217
+
197
218
  # Submit final answer to context (only if scenario is running)
198
219
  if result.content and ctx.has_scenario:
199
220
  await ctx.submit(result.content)
@@ -202,6 +223,9 @@ class MCPAgent(ABC):
202
223
 
203
224
  except Exception as e:
204
225
  logger.exception("Error while running agent:")
226
+ # Propagate error to context for platform visibility
227
+ if hasattr(ctx, "error"):
228
+ ctx.error = e
205
229
  return Trace(
206
230
  reward=0.0,
207
231
  done=True,
@@ -537,7 +561,7 @@ def find_reward(result: MCPToolResult) -> float:
537
561
  except json.JSONDecodeError:
538
562
  pass
539
563
 
540
- logger.error("Couldn't parse reward from result: %s", result)
564
+ logger.error("Couldn't parse reward from result: %s", str(result.structuredContent))
541
565
  return 0.0
542
566
 
543
567
 
@@ -1,11 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  from typing import Literal
4
5
 
5
6
  from openai import AsyncOpenAI
6
7
 
7
8
  from hud.settings import settings
8
9
 
10
+ logger = logging.getLogger(__name__)
11
+
9
12
  ResponseType = Literal["STOP", "CONTINUE"]
10
13
 
11
14
  DEFAULT_SYSTEM_PROMPT = """\
@@ -97,5 +100,6 @@ class ResponseAgent:
97
100
  else:
98
101
  return "CONTINUE"
99
102
 
100
- except Exception:
103
+ except Exception as e:
104
+ logger.warning("Auto-respond failed: %s", e)
101
105
  return "CONTINUE" # Default to continue on error
@@ -70,6 +70,18 @@ class OpenAIChatAgent(MCPAgent):
70
70
  super().__init__(params, **kwargs)
71
71
  self.config: OpenAIChatConfig
72
72
 
73
+ if (
74
+ self.config.api_key
75
+ and self.config.base_url
76
+ and settings.hud_gateway_url in self.config.base_url
77
+ and settings.api_key
78
+ and self.config.api_key != settings.api_key
79
+ ):
80
+ raise ValueError(
81
+ "OpenAIChatAgent api_key is not allowed with HUD Gateway. "
82
+ "Use HUD_API_KEY for gateway auth and BYOK headers for provider keys."
83
+ )
84
+
73
85
  if self.config.openai_client is not None:
74
86
  self.oai = self.config.openai_client
75
87
  elif self.config.api_key is not None or self.config.base_url is not None:
@@ -350,3 +350,67 @@ class TestMCPAgentToolSchemas:
350
350
  assert len(schemas) == 1
351
351
  assert schemas[0]["name"] == "my_tool"
352
352
  assert schemas[0]["description"] == "My tool description"
353
+
354
+
355
+ class TestMCPAgentErrorPropagation:
356
+ """Tests for error propagation to EvalContext."""
357
+
358
+ @pytest.mark.asyncio
359
+ async def test_exception_propagates_to_ctx_error(self) -> None:
360
+ """Test that exceptions during run() set ctx.error for platform visibility."""
361
+
362
+ class FailingAgent(MockMCPAgent):
363
+ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
364
+ raise RuntimeError("Agent crashed")
365
+
366
+ ctx = MockEvalContext(prompt="Do something")
367
+ agent = FailingAgent()
368
+
369
+ result = await agent.run(ctx)
370
+
371
+ # Should return error trace
372
+ assert result.isError is True
373
+ assert result.content is not None
374
+ assert "Agent crashed" in result.content
375
+
376
+ assert ctx.error is not None
377
+ assert isinstance(ctx.error, BaseException)
378
+ assert "Agent crashed" in str(ctx.error)
379
+
380
+ @pytest.mark.asyncio
381
+ async def test_step_error_propagates_to_ctx_error(self) -> None:
382
+ """Test that step-level errors (caught internally) set ctx.error."""
383
+ step_count = [0]
384
+
385
+ class FailOnSecondStepAgent(MockMCPAgent):
386
+ async def get_response(self, messages: list[dict[str, Any]]) -> AgentResponse:
387
+ step_count[0] += 1
388
+ if step_count[0] == 1:
389
+ return AgentResponse(
390
+ content="",
391
+ tool_calls=[MCPToolCall(name="test_tool", arguments={})],
392
+ done=False,
393
+ )
394
+ else:
395
+ raise ValueError("Step 2 failed")
396
+
397
+ ctx = MockEvalContext(prompt="Do something")
398
+ agent = FailOnSecondStepAgent()
399
+
400
+ result = await agent.run(ctx)
401
+
402
+ # Should return error trace
403
+ assert result.isError is True
404
+ assert ctx.error is not None
405
+ assert "Step 2 failed" in str(ctx.error)
406
+
407
+ @pytest.mark.asyncio
408
+ async def test_no_error_when_successful(self) -> None:
409
+ """Test that ctx.error remains None on successful run."""
410
+ ctx = MockEvalContext(prompt="Do something")
411
+ agent = MockMCPAgent()
412
+
413
+ result = await agent.run(ctx)
414
+
415
+ assert result.isError is False
416
+ assert ctx.error is None
@@ -91,10 +91,11 @@ _DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
91
91
  [eval]
92
92
  # source = "hud-evals/SheetBench-50"
93
93
  # agent = "claude"
94
- # full = false
94
+ # all = false # Run all problems instead of just 1
95
95
  # max_concurrent = 30
96
96
  # max_steps = 10
97
97
  # group_size = 1
98
+ # byok = false # Remote only; use encrypted env vars on the platform.
98
99
  # task_ids = ["task_1", "task_2"]
99
100
  # verbose = true
100
101
  # very_verbose = true
@@ -152,12 +153,13 @@ class EvalConfig(BaseModel):
152
153
  "source",
153
154
  "agent_type",
154
155
  "task_ids",
155
- "full",
156
+ "all",
156
157
  "max_concurrent",
157
158
  "max_steps",
158
159
  "verbose",
159
160
  "very_verbose",
160
161
  "group_size",
162
+ "byok",
161
163
  "remote",
162
164
  "auto_respond",
163
165
  "quiet",
@@ -171,13 +173,14 @@ class EvalConfig(BaseModel):
171
173
  agent_type: AgentType | None = None
172
174
  model: str | None = None
173
175
  task_ids: list[str] | None = None
174
- full: bool = False
176
+ all: bool = False # Run all problems instead of just 1
175
177
  max_concurrent: int = 30
176
- max_steps: int | None = None
178
+ max_steps: int = 10
177
179
  verbose: bool = False
178
180
  very_verbose: bool = False
179
- auto_respond: bool | None = None # Continue without prompting (default: True for --full)
181
+ auto_respond: bool | None = None # Continue without prompting
180
182
  group_size: int = 1
183
+ byok: bool = False
181
184
  remote: bool = False
182
185
  quiet: bool = False # Suppress opening browser for eval links
183
186
  gateway: bool = False # Use HUD Gateway for LLM API calls
@@ -208,6 +211,11 @@ class EvalConfig(BaseModel):
208
211
 
209
212
  def validate_api_keys(self) -> None:
210
213
  """Validate required API keys for the selected agent. Raises typer.Exit on failure."""
214
+ # BYOK requires remote execution (check before agent_type guard)
215
+ if self.byok and not self.remote:
216
+ hud_console.error("--byok requires --remote (BYOK only works with remote execution)")
217
+ raise typer.Exit(1)
218
+
211
219
  if self.agent_type is None:
212
220
  return
213
221
 
@@ -284,14 +292,11 @@ class EvalConfig(BaseModel):
284
292
  if self.model:
285
293
  kwargs["model"] = self.model
286
294
 
287
- if self.agent_type == AgentType.OPENAI_COMPATIBLE:
295
+ # For gateway base_url, inject HUD API key if not already set
296
+ if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
288
297
  base_url = kwargs.get("base_url", "")
289
- if "api_key" not in kwargs:
290
- # Use HUD API key for gateway, otherwise fall back to OpenAI API key
291
- if settings.hud_gateway_url in base_url:
292
- kwargs["api_key"] = settings.api_key
293
- elif settings.openai_api_key:
294
- kwargs["api_key"] = settings.openai_api_key
298
+ if settings.hud_gateway_url in base_url and settings.api_key:
299
+ kwargs["api_key"] = settings.api_key
295
300
 
296
301
  # Auto-detect Bedrock when Claude is selected with a Bedrock ARN
297
302
  # Check both model and checkpoint_name for ARN patterns
@@ -454,12 +459,20 @@ class EvalConfig(BaseModel):
454
459
 
455
460
  overrides.update({k: v for k, v in cli_args.items() if v is not None and v is not False})
456
461
 
457
- for k in ("full", "verbose", "very_verbose", "remote", "quiet", "gateway"):
462
+ for k in ("all", "verbose", "very_verbose", "remote", "quiet", "gateway"):
458
463
  if cli_args.get(k) is True:
459
464
  overrides[k] = True
460
465
  elif k in overrides and cli_args.get(k) is False:
461
466
  del overrides[k]
462
467
 
468
+ # --full is a shortcut for --all --auto-respond --max-steps 100
469
+ if overrides.get("full"):
470
+ overrides["all"] = True
471
+ if "auto_respond" not in overrides:
472
+ overrides["auto_respond"] = True
473
+ if "max_steps" not in overrides:
474
+ overrides["max_steps"] = 100
475
+
463
476
  if config:
464
477
  merged_agent_config = dict(self.agent_config)
465
478
  for item in config:
@@ -541,15 +554,13 @@ class EvalConfig(BaseModel):
541
554
  table.add_row(
542
555
  "task_ids", ", ".join(self.task_ids[:5]) + ("..." if len(self.task_ids) > 5 else "")
543
556
  )
544
- table.add_row("full", str(self.full))
545
- table.add_row("max_steps", str(self.max_steps or (100 if self.full else 10)))
557
+ table.add_row("all", str(self.all))
558
+ table.add_row("max_steps", str(self.max_steps))
546
559
  if not self.remote:
547
560
  table.add_row("max_concurrent", str(self.max_concurrent))
548
561
  if self.group_size > 1:
549
562
  table.add_row("group_size", str(self.group_size))
550
- # Show auto_respond when it will be true (explicit or via --full)
551
- effective_auto_respond = self.auto_respond if self.auto_respond is not None else self.full
552
- if effective_auto_respond:
563
+ if self.auto_respond:
553
564
  table.add_row("auto_respond", "[bold green]True[/bold green]")
554
565
  if self.very_verbose:
555
566
  table.add_row("very_verbose", "[bold green]True[/bold green]")
@@ -559,6 +570,8 @@ class EvalConfig(BaseModel):
559
570
  table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)")
560
571
  if self.gateway:
561
572
  table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)")
573
+ if self.byok:
574
+ table.add_row("byok", "[bold green]True[/bold green] (remote only)")
562
575
 
563
576
  # Tool filters (only if set)
564
577
  if self.allowed_tools:
@@ -642,8 +655,8 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
642
655
  raise typer.Exit(1)
643
656
  hud_console.info(f"Filtered to {len(filtered)} task(s) by ID")
644
657
  tasks = filtered
645
- elif not cfg.full:
646
- # Single task mode (no --full, no --task-ids)
658
+ elif not cfg.all:
659
+ # Single task mode (no --all, --full, or --task-ids)
647
660
  tasks = [tasks[0]]
648
661
  hud_console.info("Using first task (run with --full or --task-ids for more)…")
649
662
 
@@ -651,14 +664,17 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
651
664
 
652
665
  # Prepare agent kwargs
653
666
  agent_kwargs = cfg.get_agent_kwargs()
654
- auto_respond = cfg.auto_respond if cfg.auto_respond is not None else cfg.full
667
+ auto_respond = cfg.auto_respond
655
668
  if auto_respond:
656
669
  agent_kwargs = {**agent_kwargs, "auto_respond": True}
657
670
 
658
- max_steps = cfg.max_steps or (100 if cfg.full else 10)
671
+ max_steps = cfg.max_steps
659
672
 
660
673
  # Remote execution - submit to HUD platform
661
674
  if cfg.remote:
675
+ agent_kwargs = {
676
+ k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
677
+ }
662
678
  # Create a job ID for tracking
663
679
  import uuid
664
680
 
@@ -676,9 +692,10 @@ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
676
692
  agent_params=agent_kwargs,
677
693
  max_steps=max_steps,
678
694
  group_size=cfg.group_size,
695
+ use_byok=cfg.byok,
679
696
  )
680
697
 
681
- hud_console.success(f"Tasks submitted. View at: https://hud.ai/job/{job_id}")
698
+ hud_console.success(f"Tasks submitted. View at: https://hud.ai/jobs/{job_id}")
682
699
  return [], tasks
683
700
 
684
701
  # Single task mode - show extra info
@@ -724,7 +741,12 @@ def eval_command(
724
741
  None,
725
742
  help="Agent: claude, openai, operator, gemini, gemini_cua, openai_compatible, integration_test", # noqa: E501
726
743
  ),
727
- full: bool = typer.Option(False, "--full", help="Run entire dataset"),
744
+ all: bool = typer.Option(False, "--all", help="Run all problems instead of just 1"),
745
+ full: bool = typer.Option(
746
+ False,
747
+ "--full",
748
+ help="Run the entire dataset. Shortcut for --all --auto-respond --max-steps 100",
749
+ ),
728
750
  model: str | None = typer.Option(None, "--model", "-m", help="Model name"),
729
751
  config: list[str] | None = typer.Option( # noqa: B008
730
752
  None, "--config", "-c", help="Agent config: key=value"
@@ -743,10 +765,10 @@ def eval_command(
743
765
  max_steps: int | None = typer.Option(None, "--max-steps", help="Max steps per task"),
744
766
  verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"),
745
767
  very_verbose: bool = typer.Option(False, "--very-verbose", "-vv", help="Debug logs"),
746
- auto_respond: bool | None = typer.Option(
747
- None,
768
+ auto_respond: bool = typer.Option(
769
+ False,
748
770
  "--auto-respond",
749
- help="Continue without prompting after tool calls (default: True for --full)",
771
+ help="Automatically prompt the agent to continue if it does not respond with a tool call",
750
772
  ),
751
773
  group_size: int | None = typer.Option(None, "--group-size", help="Runs per task"),
752
774
  task_ids: str | None = typer.Option(None, "--task-ids", help="Comma-separated task IDs to run"),
@@ -754,6 +776,11 @@ def eval_command(
754
776
  remote: bool = typer.Option(
755
777
  False, "--remote", help="Submit tasks to platform for remote execution"
756
778
  ),
779
+ byok: bool = typer.Option(
780
+ False,
781
+ "--byok",
782
+ help="Remote only: use BYOK keys from encrypted env vars for inference",
783
+ ),
757
784
  quiet: bool = typer.Option(
758
785
  False, "--quiet", "-q", help="Suppress opening browser for eval links"
759
786
  ),
@@ -778,6 +805,7 @@ def eval_command(
778
805
  source=source,
779
806
  agent=agent,
780
807
  model=model,
808
+ all=all,
781
809
  full=full,
782
810
  max_concurrent=max_concurrent,
783
811
  max_steps=max_steps,
@@ -790,6 +818,7 @@ def eval_command(
790
818
  group_size=group_size,
791
819
  config=config,
792
820
  remote=remote,
821
+ byok=byok,
793
822
  quiet=quiet,
794
823
  gateway=gateway,
795
824
  )
@@ -23,6 +23,8 @@ PRESET_MAP: dict[str, str | None] = {
23
23
  "deep-research": "hud-deepresearch",
24
24
  "browser": "hud-browser",
25
25
  "rubrics": "hud-rubrics",
26
+ "verilog-coding-template": "verilog-coding-template",
27
+ "data-science-template": "data-science-template",
26
28
  }
27
29
 
28
30
  SKIP_DIR_NAMES = {"node_modules", "__pycache__", "dist", "build", ".next", ".git"}
@@ -92,6 +94,8 @@ def _prompt_for_preset() -> str:
92
94
  {"name": "browser", "message": "browser"},
93
95
  {"name": "deep-research", "message": "deep-research"},
94
96
  {"name": "rubrics", "message": "rubrics"},
97
+ {"name": "verilog-coding-template", "message": "verilog-coding-template"},
98
+ {"name": "data-science-template", "message": "data-science-template"},
95
99
  ]
96
100
  display_choices = [c["message"] for c in choices]
97
101
  selected = questionary.select(
@@ -99,8 +99,8 @@ async def run_dataset(
99
99
  ) as ctx:
100
100
  # Create agent fresh for each context (ensures correct tool initialization)
101
101
  agent = agent_cls.create(**(agent_params or {}))
102
- result = await agent.run(ctx, max_steps=max_steps)
103
- ctx.reward = result.reward
102
+ await agent.run(ctx, max_steps=max_steps)
103
+ # Reward is computed by EvalContext.__aexit__ from evaluate tools
104
104
 
105
105
  # For parallel execution, results are collected via ctx.results
106
106
  if hasattr(ctx, "results") and ctx.results:
@@ -207,6 +207,7 @@ async def run_single_task(
207
207
  ctx.metadata.update(metadata)
208
208
 
209
209
  result = await agent.run(ctx, max_steps=max_steps)
210
- ctx.reward = result.reward
210
+ # Reward is computed by EvalContext.__aexit__ from evaluate tools
211
211
 
212
+ # Return the Trace (ctx.reward is set by EvalContext.__aexit__)
212
213
  return result
@@ -51,6 +51,10 @@ class SingleTaskRequest(BaseModel):
51
51
  description="Additional metadata to inject into the trace context.",
52
52
  )
53
53
  trace_id: str | None = Field(default=None, description="Pre-assigned trace ID.")
54
+ use_byok: bool = Field(
55
+ default=False,
56
+ description="If True, use BYOK headers from encrypted env vars for inference.",
57
+ )
54
58
 
55
59
  @model_validator(mode="after")
56
60
  def _validate_task(self) -> SingleTaskRequest:
@@ -110,6 +114,7 @@ async def submit_rollouts(
110
114
  group_size: int = 1,
111
115
  batch_size: int = 50,
112
116
  metadata: dict[str, Any] | None = None,
117
+ use_byok: bool = False,
113
118
  ) -> None:
114
119
  """Submit rollouts to the HUD platform API for remote execution (fire-and-forget).
115
120
 
@@ -122,6 +127,7 @@ async def submit_rollouts(
122
127
  group_size: Number of rollouts per task (for variance estimation)
123
128
  batch_size: Number of rollouts per API batch request
124
129
  metadata: Additional metadata for each rollout
130
+ use_byok: If True, use BYOK keys from encrypted env vars (remote only)
125
131
  """
126
132
  from hud.eval.utils import is_v4_format
127
133
 
@@ -168,6 +174,7 @@ async def submit_rollouts(
168
174
  trace_name=trace_name,
169
175
  group_id=base_task_id if group_size > 1 else None,
170
176
  metadata=metadata or {},
177
+ use_byok=use_byok,
171
178
  )
172
179
  )
173
180
 
@@ -61,13 +61,12 @@ class RemoteConnectorMixin(MCPConfigConnectorMixin):
61
61
  self._hub_config = hub_config
62
62
 
63
63
  # Create mcp_config with standard MCP URL and hub slug in headers
64
+ # Note: Authorization is injected at request time by httpx/aiohttp hooks
65
+ # in hud.eval.instrument (uses contextvar for api_key).
64
66
  mcp_config = {
65
67
  "hud": {
66
68
  "url": settings.hud_mcp_url,
67
- "headers": {
68
- "Authorization": f"Bearer {settings.api_key}",
69
- "Environment-Name": slug,
70
- },
69
+ "headers": {"Environment-Name": slug},
71
70
  }
72
71
  }
73
72
 
@@ -323,7 +323,8 @@ class Environment(
323
323
  if conn.is_connected:
324
324
  await conn.disconnect()
325
325
  name, err = errors[0]
326
- raise ConnectionError(f"Failed to connect to {name}") from err
326
+ str_err = str(err).replace("Client failed to connect: ", "") # Strip from FastMCP
327
+ raise ConnectionError(f"Failed to connect to {name}: {str_err}") from err
327
328
 
328
329
  await self._build_routing()
329
330
 
@@ -399,13 +400,20 @@ class Environment(
399
400
  if self._router.is_local(name):
400
401
  # Call tool manager directly to avoid FastMCP context requirement
401
402
  result = await self._tool_manager.call_tool(name, arguments)
402
- return MCPToolResult(content=result.content, isError=False)
403
+ return MCPToolResult(
404
+ content=result.content,
405
+ structuredContent=result.structured_content,
406
+ )
403
407
 
404
408
  connection_name = self._router.get_connection(name)
405
409
  if connection_name:
406
410
  conn = self._connections[connection_name]
407
411
  result = await conn.call_tool(name, arguments)
408
- return MCPToolResult(content=result.content, isError=result.isError)
412
+ return MCPToolResult(
413
+ content=result.content,
414
+ isError=result.isError,
415
+ structuredContent=result.structuredContent,
416
+ )
409
417
 
410
418
  raise ValueError(f"Tool not found: {name}")
411
419