hud-python 0.5.28__tar.gz → 0.5.30__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (336)
  1. {hud_python-0.5.28 → hud_python-0.5.30}/PKG-INFO +5 -7
  2. {hud_python-0.5.28 → hud_python-0.5.30}/hud/__init__.py +2 -0
  3. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/base.py +148 -38
  4. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/claude.py +159 -49
  5. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/gemini.py +80 -4
  6. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/gemini_cua.py +2 -2
  7. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/grounded_openai.py +7 -7
  8. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/misc/integration_test_agent.py +2 -2
  9. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/misc/response_agent.py +22 -11
  10. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/openai.py +194 -56
  11. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/openai_chat.py +4 -4
  12. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/operator.py +7 -90
  13. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/tests/conftest.py +3 -3
  14. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/tests/test_base.py +11 -11
  15. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/tests/test_base_runtime.py +6 -6
  16. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/tests/test_claude.py +289 -3
  17. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/tests/test_gemini.py +259 -3
  18. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/tests/test_openai.py +164 -4
  19. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/tests/test_operator.py +70 -3
  20. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/tests/test_resolver.py +6 -6
  21. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/tests/test_run_eval.py +95 -8
  22. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/types.py +2 -1
  23. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/build.py +6 -0
  24. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/deploy.py +121 -53
  25. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/dev.py +28 -16
  26. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/eval.py +46 -40
  27. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_build.py +2 -2
  28. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_dev.py +9 -4
  29. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_eval.py +2 -2
  30. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/build_display.py +53 -0
  31. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/mcp.py +9 -2
  32. {hud_python-0.5.28 → hud_python-0.5.30}/hud/datasets/__init__.py +1 -1
  33. {hud_python-0.5.28 → hud_python-0.5.30}/hud/datasets/loader.py +43 -90
  34. {hud_python-0.5.28 → hud_python-0.5.30}/hud/datasets/runner.py +47 -3
  35. {hud_python-0.5.28 → hud_python-0.5.30}/hud/datasets/tests/test_loader.py +18 -14
  36. {hud_python-0.5.28 → hud_python-0.5.30}/hud/datasets/utils.py +0 -7
  37. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/connection.py +50 -11
  38. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/connectors/base.py +1 -1
  39. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/connectors/local.py +1 -1
  40. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/connectors/mcp_config.py +1 -1
  41. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/connectors/openai.py +2 -2
  42. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/connectors/remote.py +1 -1
  43. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/environment.py +256 -32
  44. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/router.py +2 -2
  45. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/scenarios.py +488 -184
  46. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/tests/test_connection.py +31 -0
  47. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/tests/test_environment.py +99 -0
  48. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/tests/test_scenarios.py +368 -185
  49. hud_python-0.5.30/hud/environment/tests/test_session_id.py +159 -0
  50. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/tests/test_tools.py +18 -4
  51. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/utils/formats.py +4 -5
  52. {hud_python-0.5.28 → hud_python-0.5.30}/hud/eval/context.py +42 -12
  53. {hud_python-0.5.28 → hud_python-0.5.30}/hud/eval/manager.py +40 -80
  54. {hud_python-0.5.28 → hud_python-0.5.30}/hud/eval/tests/test_context.py +59 -0
  55. {hud_python-0.5.28 → hud_python-0.5.30}/hud/eval/tests/test_manager.py +87 -1
  56. {hud_python-0.5.28 → hud_python-0.5.30}/hud/eval/types.py +1 -1
  57. hud_python-0.5.30/hud/native/__init__.py +1 -0
  58. hud_python-0.5.30/hud/native/chat.py +74 -0
  59. {hud_python-0.5.28 → hud_python-0.5.30}/hud/patches/mcp_patches.py +16 -9
  60. hud_python-0.5.30/hud/server/router.py +122 -0
  61. {hud_python-0.5.28 → hud_python-0.5.30}/hud/server/server.py +43 -38
  62. {hud_python-0.5.28 → hud_python-0.5.30}/hud/server/tests/test_mcp_server_more.py +4 -2
  63. {hud_python-0.5.28 → hud_python-0.5.30}/hud/server/tests/test_prefix_naming.py +6 -5
  64. {hud_python-0.5.28 → hud_python-0.5.30}/hud/server/tests/test_server_extra.py +2 -1
  65. {hud_python-0.5.28 → hud_python-0.5.30}/hud/server/tests/test_sigterm_runner.py +2 -1
  66. hud_python-0.5.30/hud/services/__init__.py +9 -0
  67. hud_python-0.5.30/hud/services/chat.py +365 -0
  68. hud_python-0.5.30/hud/services/chat_service.py +274 -0
  69. hud_python-0.5.30/hud/services/reply_metadata.py +50 -0
  70. hud_python-0.5.30/hud/services/tests/test_chat.py +282 -0
  71. hud_python-0.5.30/hud/services/tests/test_chat_service.py +152 -0
  72. {hud_python-0.5.28 → hud_python-0.5.30}/hud/telemetry/instrument.py +18 -1
  73. {hud_python-0.5.28 → hud_python-0.5.30}/hud/telemetry/tests/test_instrument.py +22 -0
  74. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/agent.py +1 -1
  75. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/base.py +30 -33
  76. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/apply_patch.py +3 -0
  77. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/shell.py +3 -0
  78. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/computer/anthropic.py +102 -26
  79. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/computer/openai.py +53 -28
  80. hud_python-0.5.30/hud/tools/elicitation.py +91 -0
  81. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/hosted/__init__.py +2 -0
  82. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/hosted/base.py +3 -10
  83. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/hosted/code_execution.py +20 -5
  84. hud_python-0.5.30/hud/tools/hosted/tool_search.py +82 -0
  85. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/tests/test_base.py +20 -32
  86. hud_python-0.5.30/hud/tools/tests/test_elicitation.py +118 -0
  87. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/tests/test_native_tool_e2e.py +9 -9
  88. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/tests/test_native_types.py +54 -18
  89. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/tests/test_types.py +77 -0
  90. hud_python-0.5.30/hud/tools/types.py +279 -0
  91. {hud_python-0.5.28 → hud_python-0.5.30}/hud/types.py +22 -2
  92. hud_python-0.5.30/hud/utils/tests/__init__.py +0 -0
  93. {hud_python-0.5.28 → hud_python-0.5.30}/hud/utils/tests/test_version.py +1 -1
  94. {hud_python-0.5.28 → hud_python-0.5.30}/hud/version.py +1 -1
  95. {hud_python-0.5.28 → hud_python-0.5.30}/pyproject.toml +6 -6
  96. hud_python-0.5.28/hud/server/router.py +0 -164
  97. hud_python-0.5.28/hud/tools/types.py +0 -155
  98. {hud_python-0.5.28 → hud_python-0.5.30}/.gitignore +0 -0
  99. {hud_python-0.5.28 → hud_python-0.5.30}/LICENSE +0 -0
  100. {hud_python-0.5.28 → hud_python-0.5.30}/README.md +0 -0
  101. {hud_python-0.5.28 → hud_python-0.5.30}/examples/README.md +0 -0
  102. {hud_python-0.5.28 → hud_python-0.5.30}/hud/__main__.py +0 -0
  103. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/__init__.py +0 -0
  104. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/gateway.py +0 -0
  105. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/misc/__init__.py +0 -0
  106. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/resolver.py +0 -0
  107. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/tests/__init__.py +0 -0
  108. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/tests/test_grounded_openai_agent.py +0 -0
  109. {hud_python-0.5.28 → hud_python-0.5.30}/hud/agents/tests/test_integration_test_agent.py +0 -0
  110. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/__init__.py +0 -0
  111. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/__main__.py +0 -0
  112. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/analyze.py +0 -0
  113. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/cancel.py +0 -0
  114. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/convert/__init__.py +0 -0
  115. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/convert/base.py +0 -0
  116. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/convert/harbor.py +0 -0
  117. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/convert/tests/__init__.py +0 -0
  118. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/convert/tests/conftest.py +0 -0
  119. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/convert/tests/test_harbor.py +0 -0
  120. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/debug.py +0 -0
  121. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/flows/__init__.py +0 -0
  122. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/flows/dev.py +0 -0
  123. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/flows/init.py +0 -0
  124. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/flows/tasks.py +0 -0
  125. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/flows/templates.py +0 -0
  126. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/flows/tests/__init__.py +0 -0
  127. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/flows/tests/test_dev.py +0 -0
  128. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/init.py +0 -0
  129. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/link.py +0 -0
  130. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/models.py +0 -0
  131. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/push.py +0 -0
  132. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/rft.py +0 -0
  133. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/rft_status.py +0 -0
  134. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/__init__.py +0 -0
  135. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_analyze.py +0 -0
  136. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_analyze_metadata.py +0 -0
  137. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_analyze_module.py +0 -0
  138. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_build_failure.py +0 -0
  139. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_build_module.py +0 -0
  140. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_cli_init.py +0 -0
  141. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_cli_main.py +0 -0
  142. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_cli_more_wrappers.py +0 -0
  143. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_cli_root.py +0 -0
  144. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_convert.py +0 -0
  145. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_debug.py +0 -0
  146. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_debug_directory_mode.py +0 -0
  147. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_deploy.py +0 -0
  148. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_eval_bedrock.py +0 -0
  149. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_init.py +0 -0
  150. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_main_module.py +0 -0
  151. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_mcp_server.py +0 -0
  152. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_push.py +0 -0
  153. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_push_happy.py +0 -0
  154. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_push_wrapper.py +0 -0
  155. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/tests/test_utils.py +0 -0
  156. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/__init__.py +0 -0
  157. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/api.py +0 -0
  158. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/args.py +0 -0
  159. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/build_logs.py +0 -0
  160. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/config.py +0 -0
  161. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/context.py +0 -0
  162. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/docker.py +0 -0
  163. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/env_check.py +0 -0
  164. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/environment.py +0 -0
  165. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/git.py +0 -0
  166. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/interactive.py +0 -0
  167. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/lockfile.py +0 -0
  168. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/logging.py +0 -0
  169. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/metadata.py +0 -0
  170. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/server.py +0 -0
  171. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/source_hash.py +0 -0
  172. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/tasks.py +0 -0
  173. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/tests/__init__.py +0 -0
  174. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/tests/test_config.py +0 -0
  175. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/tests/test_docker.py +0 -0
  176. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/tests/test_docker_hints.py +0 -0
  177. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/tests/test_env_check.py +0 -0
  178. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/tests/test_environment.py +0 -0
  179. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/tests/test_git.py +0 -0
  180. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/tests/test_interactive_module.py +0 -0
  181. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/tests/test_logging_utils.py +0 -0
  182. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/tests/test_metadata.py +0 -0
  183. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/tests/test_source_hash.py +0 -0
  184. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/tests/test_tasks.py +0 -0
  185. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/validation.py +0 -0
  186. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/version_check.py +0 -0
  187. {hud_python-0.5.28 → hud_python-0.5.30}/hud/cli/utils/viewer.py +0 -0
  188. {hud_python-0.5.28 → hud_python-0.5.30}/hud/datasets/tests/__init__.py +0 -0
  189. {hud_python-0.5.28 → hud_python-0.5.30}/hud/datasets/tests/test_utils.py +0 -0
  190. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/__init__.py +0 -0
  191. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/connectors/__init__.py +0 -0
  192. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/integrations/__init__.py +0 -0
  193. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/integrations/adk.py +0 -0
  194. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/integrations/anthropic.py +0 -0
  195. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/integrations/gemini.py +0 -0
  196. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/integrations/langchain.py +0 -0
  197. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/integrations/llamaindex.py +0 -0
  198. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/integrations/openai.py +0 -0
  199. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/mock.py +0 -0
  200. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/tests/__init__.py +0 -0
  201. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/tests/test_connectors.py +0 -0
  202. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/tests/test_integrations.py +0 -0
  203. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/tests/test_local_connectors.py +0 -0
  204. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/types.py +0 -0
  205. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/utils/__init__.py +0 -0
  206. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/utils/schema.py +0 -0
  207. {hud_python-0.5.28 → hud_python-0.5.30}/hud/environment/utils/tool_wrappers.py +0 -0
  208. {hud_python-0.5.28 → hud_python-0.5.30}/hud/eval/__init__.py +0 -0
  209. {hud_python-0.5.28 → hud_python-0.5.30}/hud/eval/display.py +0 -0
  210. {hud_python-0.5.28 → hud_python-0.5.30}/hud/eval/instrument.py +0 -0
  211. {hud_python-0.5.28 → hud_python-0.5.30}/hud/eval/parallel.py +0 -0
  212. {hud_python-0.5.28 → hud_python-0.5.30}/hud/eval/task.py +0 -0
  213. {hud_python-0.5.28 → hud_python-0.5.30}/hud/eval/tests/__init__.py +0 -0
  214. {hud_python-0.5.28 → hud_python-0.5.30}/hud/eval/tests/test_eval.py +0 -0
  215. {hud_python-0.5.28 → hud_python-0.5.30}/hud/eval/tests/test_parallel.py +0 -0
  216. {hud_python-0.5.28 → hud_python-0.5.30}/hud/eval/tests/test_task.py +0 -0
  217. {hud_python-0.5.28 → hud_python-0.5.30}/hud/eval/utils.py +0 -0
  218. {hud_python-0.5.28 → hud_python-0.5.30}/hud/patches/__init__.py +0 -0
  219. {hud_python-0.5.28 → hud_python-0.5.30}/hud/patches/warnings.py +0 -0
  220. {hud_python-0.5.28 → hud_python-0.5.30}/hud/py.typed +0 -0
  221. {hud_python-0.5.28 → hud_python-0.5.30}/hud/server/__init__.py +0 -0
  222. {hud_python-0.5.28 → hud_python-0.5.30}/hud/server/context.py +0 -0
  223. {hud_python-0.5.28 → hud_python-0.5.30}/hud/server/helper/__init__.py +0 -0
  224. {hud_python-0.5.28 → hud_python-0.5.30}/hud/server/low_level.py +0 -0
  225. {hud_python-0.5.28 → hud_python-0.5.30}/hud/server/tests/__init__.py +0 -0
  226. {hud_python-0.5.28 → hud_python-0.5.30}/hud/server/tests/test_add_tool.py +0 -0
  227. {hud_python-0.5.28 → hud_python-0.5.30}/hud/server/tests/test_context.py +0 -0
  228. {hud_python-0.5.28 → hud_python-0.5.30}/hud/server/tests/test_mcp_server_handlers.py +0 -0
  229. {hud_python-0.5.28 → hud_python-0.5.30}/hud/server/tests/test_mcp_server_integration.py +0 -0
  230. {hud_python-0.5.28 → hud_python-0.5.30}/hud/server/tests/test_run_wrapper.py +0 -0
  231. {hud_python-0.5.28/hud/shared → hud_python-0.5.30/hud/services}/tests/__init__.py +0 -0
  232. {hud_python-0.5.28 → hud_python-0.5.30}/hud/settings.py +0 -0
  233. {hud_python-0.5.28 → hud_python-0.5.30}/hud/shared/__init__.py +0 -0
  234. {hud_python-0.5.28 → hud_python-0.5.30}/hud/shared/exceptions.py +0 -0
  235. {hud_python-0.5.28 → hud_python-0.5.30}/hud/shared/hints.py +0 -0
  236. {hud_python-0.5.28 → hud_python-0.5.30}/hud/shared/requests.py +0 -0
  237. {hud_python-0.5.28/hud/telemetry → hud_python-0.5.30/hud/shared}/tests/__init__.py +0 -0
  238. {hud_python-0.5.28 → hud_python-0.5.30}/hud/shared/tests/test_exceptions.py +0 -0
  239. {hud_python-0.5.28 → hud_python-0.5.30}/hud/shared/tests/test_hints.py +0 -0
  240. {hud_python-0.5.28 → hud_python-0.5.30}/hud/shared/tests/test_requests.py +0 -0
  241. {hud_python-0.5.28 → hud_python-0.5.30}/hud/telemetry/__init__.py +0 -0
  242. {hud_python-0.5.28 → hud_python-0.5.30}/hud/telemetry/exporter.py +0 -0
  243. {hud_python-0.5.28/hud/utils → hud_python-0.5.30/hud/telemetry}/tests/__init__.py +0 -0
  244. {hud_python-0.5.28 → hud_python-0.5.30}/hud/telemetry/tests/test_eval_telemetry.py +0 -0
  245. {hud_python-0.5.28 → hud_python-0.5.30}/hud/telemetry/tests/test_exporter.py +0 -0
  246. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/__init__.py +0 -0
  247. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/__init__.py +0 -0
  248. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/bash.py +0 -0
  249. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/edit.py +0 -0
  250. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/gemini_edit.py +0 -0
  251. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/gemini_shell.py +0 -0
  252. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/session.py +0 -0
  253. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/tests/__init__.py +0 -0
  254. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/tests/test_apply_patch.py +0 -0
  255. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/tests/test_bash.py +0 -0
  256. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/tests/test_bash_extended.py +0 -0
  257. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/tests/test_bash_integration.py +0 -0
  258. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/tests/test_edit.py +0 -0
  259. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/tests/test_gemini_tools.py +0 -0
  260. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/tests/test_shell.py +0 -0
  261. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/coding/utils.py +0 -0
  262. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/computer/__init__.py +0 -0
  263. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/computer/gemini.py +0 -0
  264. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/computer/glm.py +0 -0
  265. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/computer/hud.py +0 -0
  266. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/computer/qwen.py +0 -0
  267. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/computer/settings.py +0 -0
  268. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/computer/tests/__init__.py +0 -0
  269. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/computer/tests/test_computer.py +0 -0
  270. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/computer/tests/test_computer_actions.py +0 -0
  271. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/computer/tests/test_glm_computer.py +0 -0
  272. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/executors/__init__.py +0 -0
  273. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/executors/base.py +0 -0
  274. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/executors/pyautogui.py +0 -0
  275. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/executors/tests/__init__.py +0 -0
  276. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/executors/tests/test_base_executor.py +0 -0
  277. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/executors/tests/test_pyautogui_executor.py +0 -0
  278. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/executors/xdo.py +0 -0
  279. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/filesystem/__init__.py +0 -0
  280. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/filesystem/base.py +0 -0
  281. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/filesystem/gemini.py +0 -0
  282. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/filesystem/glob.py +0 -0
  283. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/filesystem/grep.py +0 -0
  284. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/filesystem/list.py +0 -0
  285. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/filesystem/read.py +0 -0
  286. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/filesystem/tests/__init__.py +0 -0
  287. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/filesystem/tests/test_glob.py +0 -0
  288. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/filesystem/tests/test_grep.py +0 -0
  289. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/filesystem/tests/test_list.py +0 -0
  290. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/filesystem/tests/test_read.py +0 -0
  291. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/grounding/__init__.py +0 -0
  292. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/grounding/config.py +0 -0
  293. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/grounding/grounded_tool.py +0 -0
  294. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/grounding/grounder.py +0 -0
  295. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/grounding/tests/__init__.py +0 -0
  296. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/grounding/tests/test_grounded_tool.py +0 -0
  297. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/hosted/google_search.py +0 -0
  298. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/hosted/url_context.py +0 -0
  299. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/hosted/web_fetch.py +0 -0
  300. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/hosted/web_search.py +0 -0
  301. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/jupyter.py +0 -0
  302. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/memory/__init__.py +0 -0
  303. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/memory/base.py +0 -0
  304. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/memory/claude.py +0 -0
  305. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/memory/gemini.py +0 -0
  306. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/memory/session.py +0 -0
  307. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/memory/tests/__init__.py +0 -0
  308. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/memory/tests/test_claude.py +0 -0
  309. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/memory/tests/test_gemini.py +0 -0
  310. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/memory/tests/test_session.py +0 -0
  311. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/native_types.py +0 -0
  312. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/playwright.py +0 -0
  313. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/response.py +0 -0
  314. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/submit.py +0 -0
  315. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/tests/__init__.py +0 -0
  316. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/tests/test_agent_tool.py +0 -0
  317. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/tests/test_init.py +0 -0
  318. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/tests/test_jupyter_tool.py +0 -0
  319. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/tests/test_playwright_tool.py +0 -0
  320. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/tests/test_response.py +0 -0
  321. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/tests/test_submit.py +0 -0
  322. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/tests/test_tools.py +0 -0
  323. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/tests/test_tools_init.py +0 -0
  324. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/tests/test_utils.py +0 -0
  325. {hud_python-0.5.28 → hud_python-0.5.30}/hud/tools/utils.py +0 -0
  326. {hud_python-0.5.28 → hud_python-0.5.30}/hud/utils/__init__.py +0 -0
  327. {hud_python-0.5.28 → hud_python-0.5.30}/hud/utils/env.py +0 -0
  328. {hud_python-0.5.28 → hud_python-0.5.30}/hud/utils/hud_console.py +0 -0
  329. {hud_python-0.5.28 → hud_python-0.5.30}/hud/utils/mcp.py +0 -0
  330. {hud_python-0.5.28 → hud_python-0.5.30}/hud/utils/pretty_errors.py +0 -0
  331. {hud_python-0.5.28 → hud_python-0.5.30}/hud/utils/strict_schema.py +0 -0
  332. {hud_python-0.5.28 → hud_python-0.5.30}/hud/utils/tests/test_init.py +0 -0
  333. {hud_python-0.5.28 → hud_python-0.5.30}/hud/utils/tests/test_pretty_errors.py +0 -0
  334. {hud_python-0.5.28 → hud_python-0.5.30}/hud/utils/tests/test_tool_shorthand.py +0 -0
  335. {hud_python-0.5.28 → hud_python-0.5.30}/hud/utils/tool_shorthand.py +0 -0
  336. {hud_python-0.5.28 → hud_python-0.5.30}/hud/utils/types.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hud-python
3
- Version: 0.5.28
3
+ Version: 0.5.30
4
4
  Summary: SDK for the HUD platform.
5
5
  Project-URL: Homepage, https://github.com/hud-evals/hud-python
6
6
  Project-URL: Bug Tracker, https://github.com/hud-evals/hud-python/issues
@@ -34,11 +34,12 @@ Classifier: Programming Language :: Python :: 3
34
34
  Classifier: Programming Language :: Python :: 3.11
35
35
  Classifier: Programming Language :: Python :: 3.12
36
36
  Requires-Python: <3.13,>=3.11
37
+ Requires-Dist: a2a-sdk>=0.3.24
37
38
  Requires-Dist: blessed>=1.20.0
38
- Requires-Dist: fastmcp==2.13.3
39
+ Requires-Dist: fastmcp==3.0.2
39
40
  Requires-Dist: httpx<1,>=0.23.0
40
- Requires-Dist: mcp<1.23,>1.21.1
41
- Requires-Dist: openai>=2.8.1
41
+ Requires-Dist: mcp<2.0,>=1.24.0
42
+ Requires-Dist: openai>=2.26.0
42
43
  Requires-Dist: packaging>=21.0
43
44
  Requires-Dist: prompt-toolkit==3.0.51
44
45
  Requires-Dist: pydantic-settings<3,>=2.2
@@ -51,7 +52,6 @@ Requires-Dist: typer>=0.9.0
51
52
  Requires-Dist: watchfiles>=0.21.0
52
53
  Provides-Extra: agent
53
54
  Requires-Dist: anthropic>=0.78.0; extra == 'agent'
54
- Requires-Dist: datasets>=2.14.0; extra == 'agent'
55
55
  Requires-Dist: google-genai; extra == 'agent'
56
56
  Requires-Dist: langchain>=1.1.0; extra == 'agent'
57
57
  Requires-Dist: mcp-use==1.5.0; extra == 'agent'
@@ -60,7 +60,6 @@ Requires-Dist: pillow>=11.1.0; extra == 'agent'
60
60
  Requires-Dist: tornado>=6.5.2; extra == 'agent'
61
61
  Provides-Extra: agents
62
62
  Requires-Dist: anthropic>=0.78.0; extra == 'agents'
63
- Requires-Dist: datasets>=2.14.0; extra == 'agents'
64
63
  Requires-Dist: google-genai; extra == 'agents'
65
64
  Requires-Dist: langchain>=1.1.0; extra == 'agents'
66
65
  Requires-Dist: mcp-use==1.5.0; extra == 'agents'
@@ -71,7 +70,6 @@ Provides-Extra: bedrock
71
70
  Requires-Dist: anthropic[bedrock]>=0.78.0; extra == 'bedrock'
72
71
  Provides-Extra: dev
73
72
  Requires-Dist: anthropic>=0.78.0; extra == 'dev'
74
- Requires-Dist: datasets>=2.14.0; extra == 'dev'
75
73
  Requires-Dist: dotenv>=0.9.9; extra == 'dev'
76
74
  Requires-Dist: google-adk; extra == 'dev'
77
75
  Requires-Dist: google-genai; extra == 'dev'
@@ -12,6 +12,7 @@ from . import patches as _patches # noqa: F401
12
12
  from .environment import Environment
13
13
  from .eval import EvalContext
14
14
  from .eval import run_eval as eval
15
+ from .services import Chat
15
16
  from .telemetry.instrument import instrument
16
17
 
17
18
 
@@ -30,6 +31,7 @@ def trace(*args: object, **kwargs: object) -> EvalContext:
30
31
 
31
32
 
32
33
  __all__ = [
34
+ "Chat",
33
35
  "Environment",
34
36
  "EvalContext",
35
37
  "eval",
@@ -5,6 +5,7 @@ from __future__ import annotations
5
5
  import asyncio
6
6
  import json
7
7
  import logging
8
+ import re
8
9
  from abc import ABC, abstractmethod
9
10
  from dataclasses import dataclass, field
10
11
  from typing import TYPE_CHECKING, Any, ClassVar, Literal
@@ -12,7 +13,8 @@ from typing import TYPE_CHECKING, Any, ClassVar, Literal
12
13
  import mcp.types as types
13
14
 
14
15
  from hud.tools.native_types import NativeToolSpec
15
- from hud.types import AgentResponse, AgentType, BaseAgentConfig, MCPToolCall, MCPToolResult, Trace
16
+ from hud.tools.types import Citation
17
+ from hud.types import AgentType, BaseAgentConfig, InferenceResult, MCPToolCall, MCPToolResult, Trace
16
18
  from hud.utils.hud_console import HUDConsole
17
19
 
18
20
  from .types import BaseCreateParams
@@ -417,21 +419,25 @@ class MCPAgent(ABC):
417
419
  await self._initialize_from_ctx(ctx)
418
420
 
419
421
  try:
420
- # Build initial context - optionally append setup tool output
421
- # Check ctx first (task-level override), then fall back to agent config
422
- append_setup = getattr(ctx, "append_setup_output", False) or getattr(
423
- self.config, "append_setup_output", False
424
- )
425
- initial_prompt = ctx.prompt
426
- if append_setup:
427
- setup_output = getattr(ctx, "setup_output", None)
428
- if setup_output:
429
- initial_prompt = f"{initial_prompt}\n\n{setup_output}"
422
+ # Build initial context
423
+ conversation: list[dict[str, str]] | None = getattr(ctx, "conversation", None)
430
424
 
431
- # Build initial blocks (text prompt + optional screenshot)
432
- initial_blocks = text_to_blocks(initial_prompt)
425
+ if conversation:
426
+ # Multi-turn: build alternating role messages
427
+ initial_messages = await self._build_conversation_messages(conversation)
428
+ else:
429
+ # Single-turn: single user message from prompt
430
+ append_setup = getattr(ctx, "append_setup_output", False) or getattr(
431
+ self.config, "append_setup_output", False
432
+ )
433
+ initial_prompt = ctx.prompt
434
+ if append_setup:
435
+ setup_output = getattr(ctx, "setup_output", None)
436
+ if setup_output:
437
+ initial_prompt = f"{initial_prompt}\n\n{setup_output}"
438
+ initial_messages = await self.format_message(initial_prompt)
433
439
 
434
- result = await self._run_context(initial_blocks, max_steps=max_steps)
440
+ result = await self._run_context(initial_messages, max_steps=max_steps)
435
441
 
436
442
  # Propagate error state to context for platform visibility
437
443
  if result.isError and hasattr(ctx, "error"):
@@ -440,7 +446,15 @@ class MCPAgent(ABC):
440
446
 
441
447
  # Submit final answer to context (only if scenario is running)
442
448
  if result.content and ctx.has_scenario:
443
- await ctx.submit(result.content)
449
+ if result.citations:
450
+ await ctx.submit(
451
+ {
452
+ "content": result.content,
453
+ "citations": result.citations,
454
+ }
455
+ )
456
+ else:
457
+ await ctx.submit(result.content)
444
458
 
445
459
  return result
446
460
 
@@ -460,30 +474,48 @@ class MCPAgent(ABC):
460
474
  # Cleanup auto-created resources
461
475
  await self._cleanup()
462
476
 
463
- async def _run_context(
464
- self, context: list[types.ContentBlock], *, max_steps: int = 10
465
- ) -> Trace:
477
+ def _map_role(self, role: str) -> str:
478
+ """Map a canonical role name to the provider-specific role.
479
+
480
+ Override in subclasses where the provider uses different role names.
481
+ Default passes through (works for OpenAI and Claude which use "assistant").
466
482
  """
467
- Run the agent with the given context messages. This is the core agent loop.
483
+ return role
484
+
485
+ async def _build_conversation_messages(self, conversation: list[dict[str, str]]) -> list[Any]:
486
+ """Build provider-formatted messages from a conversation history."""
487
+ result: list[Any] = []
488
+ for msg in conversation:
489
+ role = self._map_role(msg.get("role", "user"))
490
+ content = msg.get("content", "")
491
+ formatted = await self.format_message(content)
492
+ for fm in formatted:
493
+ if isinstance(fm, dict):
494
+ fm["role"] = role
495
+ elif hasattr(fm, "role"):
496
+ fm.role = role # type: ignore[attr-defined]
497
+ result.extend(formatted)
498
+ return result
499
+
500
+ async def _run_context(self, initial_messages: list[Any], *, max_steps: int = 10) -> Trace:
501
+ """
502
+ Run the agent with pre-built messages. This is the core agent loop.
468
503
 
469
504
  Args:
470
- context: The context to complete
505
+ initial_messages: Provider-formatted messages (from format_message or conversation)
471
506
  max_steps: Maximum number of steps (-1 for infinite)
472
507
 
473
508
  Returns:
474
509
  Trace with reward, done, content fields and trace steps
475
510
  """
476
- final_response = None
511
+ final_response: InferenceResult | None = None
477
512
  error = None
478
513
 
479
514
  messages: list[Any] = []
480
515
 
481
516
  try:
482
- # Start with system messages
483
517
  messages = await self.get_system_messages()
484
-
485
- # Add initial context
486
- messages.extend(await self.format_message(context))
518
+ messages.extend(initial_messages)
487
519
  self.console.debug(f"Messages: {messages}")
488
520
 
489
521
  step_count = 0
@@ -513,6 +545,19 @@ class MCPAgent(ABC):
513
545
  except Exception as e:
514
546
  self.console.warning_log(f"Auto-respond failed: {e}")
515
547
  if decision == "STOP":
548
+ if (
549
+ getattr(self.ctx, "scenario_enable_citations", False)
550
+ and not response.citations
551
+ ):
552
+ recovered = self._recover_citations_from_content(response)
553
+ if recovered:
554
+ self.console.info_log(
555
+ "Recovered citations from JSON answer payload"
556
+ )
557
+ else:
558
+ self.console.warning_log(
559
+ "Citations required by scenario but missing in final response" # noqa: E501
560
+ )
516
561
  self.console.debug("Stopping execution")
517
562
  final_response = response
518
563
  break
@@ -564,7 +609,6 @@ class MCPAgent(ABC):
564
609
  else:
565
610
  is_error = False
566
611
 
567
- # Ensure all parameters are the correct type
568
612
  # Use ctx.reward if already set (e.g., from scenario evaluate), otherwise 0.0
569
613
  # Note: For v4 tasks with evaluate_tool, reward is set in __aexit__ after this returns,
570
614
  # so callers should prefer ctx.reward over Trace.reward for the final result.
@@ -574,17 +618,81 @@ class MCPAgent(ABC):
574
618
  if ctx_reward is not None:
575
619
  reward = ctx_reward
576
620
 
577
- trace_params = {
578
- "reward": reward,
579
- "done": True,
580
- "messages": messages,
581
- "content": final_response.content if final_response else error,
582
- "isError": is_error,
583
- "info": {"error": error} if error else {},
584
- }
585
- trace_result = Trace(**trace_params)
621
+ return Trace(
622
+ reward=reward,
623
+ done=True,
624
+ messages=messages,
625
+ content=final_response.content if final_response else error,
626
+ isError=is_error,
627
+ citations=final_response.citations if final_response else [],
628
+ info={"error": error} if error else {},
629
+ )
586
630
 
587
- return trace_result
631
+ def _recover_citations_from_content(self, response: InferenceResult) -> bool:
632
+ """Try to extract citations from model content when native citations are missing.
633
+
634
+ Handles two cases: raw JSON content and fenced ```json blocks.
635
+ """
636
+ raw = response.content or ""
637
+ if not raw:
638
+ return False
639
+
640
+ # Try raw content first, then try extracting from fenced block.
641
+ for text in dict.fromkeys([raw, self._extract_fenced_json(raw) or ""]):
642
+ if not text:
643
+ continue
644
+ try:
645
+ parsed = json.loads(text)
646
+ except (json.JSONDecodeError, TypeError):
647
+ continue
648
+ if not isinstance(parsed, dict):
649
+ continue
650
+
651
+ raw_citations = parsed.get("citations")
652
+ if not isinstance(raw_citations, list) or not raw_citations:
653
+ continue
654
+
655
+ normalized: list[Citation] = [
656
+ c
657
+ for cit in raw_citations
658
+ if isinstance(cit, dict) and (c := self._normalize_citation(cit)) is not None
659
+ ]
660
+ if not normalized:
661
+ continue
662
+
663
+ content = parsed.get("content")
664
+ if isinstance(content, str) and content.strip():
665
+ response.content = content
666
+ response.citations = [c.model_dump(exclude={"provider_data"}) for c in normalized]
667
+ return True
668
+
669
+ return False
670
+
671
+ @staticmethod
672
+ def _extract_fenced_json(value: str) -> str | None:
673
+ """Extract JSON content from a fenced code block."""
674
+ match = re.search(r"```(?:json)?\s*\n(.*?)```", value, re.DOTALL)
675
+ return match.group(1).strip() if match else None
676
+
677
+ @staticmethod
678
+ def _normalize_citation(cit: dict[str, Any]) -> Citation | None:
679
+ """Normalize a citation dict to canonical Citation shape.
680
+
681
+ Maps common key aliases to canonical names and validates via Citation.
682
+ Returns None only if construction fails (e.g. extra-forbid violation).
683
+ """
684
+ source = cit.get("source") or cit.get("document") or ""
685
+ try:
686
+ return Citation(
687
+ type=cit.get("type", "document_citation"),
688
+ text=cit.get("text") or cit.get("cited_text", ""),
689
+ source=str(source),
690
+ title=cit.get("title") or cit.get("document_title"),
691
+ start_index=cit.get("start_index", cit.get("start_char_index")),
692
+ end_index=cit.get("end_index", cit.get("end_char_index")),
693
+ )
694
+ except Exception:
695
+ return None
588
696
 
589
697
  async def call_tools(
590
698
  self, tool_call: MCPToolCall | list[MCPToolCall] | None = None
@@ -629,7 +737,7 @@ class MCPAgent(ABC):
629
737
  raise NotImplementedError
630
738
 
631
739
  @abstractmethod
632
- async def get_response(self, messages: list[Any]) -> AgentResponse:
740
+ async def get_response(self, messages: list[Any]) -> InferenceResult:
633
741
  """
634
742
  Get response from the model including any tool calls.
635
743
 
@@ -756,8 +864,10 @@ def _parse_spec_dict(spec_dict: dict[str, Any]) -> NativeToolSpec | None:
756
864
  """Parse a dict (from MCP meta) into a NativeToolSpec."""
757
865
  if not spec_dict:
758
866
  return None
759
- known_fields = {"api_type", "api_name", "beta", "hosted", "role", "supported_models"}
867
+ known_fields = {"api_type", "api_name", "beta", "hosted", "role", "supported_models", "extra"}
760
868
  extra = {k: v for k, v in spec_dict.items() if k not in known_fields}
869
+ if isinstance(spec_dict.get("extra"), dict):
870
+ extra.update(spec_dict["extra"])
761
871
  supported_models_raw = spec_dict.get("supported_models")
762
872
  supported_models: tuple[str, ...] | None = None
763
873
  if supported_models_raw:
@@ -17,6 +17,7 @@ from anthropic.types.beta import (
17
17
  BetaContentBlockParam,
18
18
  BetaImageBlockParam,
19
19
  BetaMessageParam,
20
+ BetaPlainTextSourceParam,
20
21
  BetaRequestDocumentBlockParam,
21
22
  BetaTextBlockParam,
22
23
  BetaToolBash20250124Param,
@@ -31,7 +32,7 @@ from anthropic.types.beta import (
31
32
  from hud.settings import settings
32
33
  from hud.tools.computer.settings import computer_settings
33
34
  from hud.tools.native_types import NativeToolSpec
34
- from hud.types import AgentResponse, AgentType, BaseAgentConfig, MCPToolCall, MCPToolResult
35
+ from hud.types import AgentType, BaseAgentConfig, InferenceResult, MCPToolCall, MCPToolResult
35
36
  from hud.utils.hud_console import HUDConsole
36
37
  from hud.utils.types import with_signature
37
38
 
@@ -155,9 +156,11 @@ class ClaudeAgent(MCPAgent):
155
156
 
156
157
  # these will be initialized in _convert_tools_for_claude
157
158
  self.has_computer_tool = False
158
- self.tool_mapping = {}
159
- self.claude_tools = []
160
- self._required_betas = set()
159
+ self.tool_mapping: dict[str, str] = {}
160
+ self.claude_tools: list[BetaToolUnionParam] = []
161
+ self._required_betas: set[str] = set()
162
+ self._tool_search_threshold: int | None = None
163
+ self._gated_screenshot_tools: set[str] = set()
161
164
 
162
165
  def _on_tools_ready(self) -> None:
163
166
  """Build Claude-specific tool mappings after tools are discovered."""
@@ -167,6 +170,67 @@ class ClaudeAgent(MCPAgent):
167
170
  """No system messages for Claude because applied in get_response"""
168
171
  return []
169
172
 
173
+ def _result_from_response_blocks(self, response_blocks: list[Any]) -> InferenceResult:
174
+ """Extract text/tool calls/citations from Anthropic response blocks."""
175
+ result = InferenceResult(content="", tool_calls=[], done=True)
176
+ text_content = ""
177
+ thinking_content = ""
178
+ citations: list[dict[str, Any]] = []
179
+
180
+ for block in response_blocks:
181
+ block_type = getattr(block, "type", None)
182
+ if block_type == "tool_use":
183
+ block_input = getattr(block, "input", {})
184
+ mcp_name = self.tool_mapping.get(
185
+ getattr(block, "name", ""),
186
+ getattr(block, "name", ""),
187
+ )
188
+ arguments = block_input if isinstance(block_input, dict) else block_input.__dict__
189
+ if mcp_name in self._gated_screenshot_tools:
190
+ arguments = {**arguments, "take_screenshot_on_click": False}
191
+ logger.debug(
192
+ "Injected take_screenshot_on_click=False for gated tool %s", mcp_name
193
+ )
194
+ tool_call = MCPToolCall(
195
+ id=getattr(block, "id", ""),
196
+ name=mcp_name,
197
+ arguments=arguments,
198
+ )
199
+ result.tool_calls.append(tool_call)
200
+ result.done = False
201
+ elif block_type == "text":
202
+ text = getattr(block, "text", "") or ""
203
+ text_content += text
204
+ block_citations = getattr(block, "citations", None) or []
205
+ for cit in block_citations:
206
+ cit_dict = {
207
+ "type": "document_citation",
208
+ "text": getattr(cit, "cited_text", "") or "",
209
+ "source": (
210
+ str(idx)
211
+ if (idx := getattr(cit, "document_index", None)) is not None
212
+ else getattr(cit, "document_title", "") or ""
213
+ ),
214
+ "title": getattr(cit, "document_title", None),
215
+ "start_index": getattr(cit, "start_char_index", None),
216
+ "end_index": getattr(cit, "end_char_index", None),
217
+ }
218
+ normalized = self._normalize_citation(cit_dict)
219
+ if normalized is not None:
220
+ citations.append(normalized.model_dump(exclude={"provider_data"}))
221
+ elif block_type == "thinking":
222
+ thinking = getattr(block, "thinking", "") or ""
223
+ if thinking:
224
+ if thinking_content:
225
+ thinking_content += "\n"
226
+ thinking_content += thinking
227
+
228
+ result.content = text_content
229
+ result.citations = citations
230
+ if thinking_content:
231
+ result.reasoning = thinking_content
232
+ return result
233
+
170
234
  async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[BetaMessageParam]:
171
235
  """Format messages for Claude."""
172
236
  # Convert MCP content types to Anthropic content types
@@ -234,7 +298,7 @@ class ClaudeAgent(MCPAgent):
234
298
  content=[text_to_content_block(retry_text)],
235
299
  )
236
300
 
237
- async def get_response(self, messages: list[BetaMessageParam]) -> AgentResponse:
301
+ async def get_response(self, messages: list[BetaMessageParam]) -> InferenceResult:
238
302
  """Get response from Claude including any tool calls."""
239
303
  messages_cached = self._add_prompt_caching(messages)
240
304
  # betas to use - collected during tool conversion based on native specs
@@ -242,6 +306,24 @@ class ClaudeAgent(MCPAgent):
242
306
  # anthropic-beta header which the API rejects.
243
307
  betas: list[str] | Omit = list(self._required_betas) if self._required_betas else Omit()
244
308
 
309
+ effective_tools: list[BetaToolUnionParam] = list(self.claude_tools)
310
+ if self._tool_search_threshold is not None:
311
+ generic_count = sum(
312
+ 1 for t in effective_tools if isinstance(t, dict) and "input_schema" in t
313
+ )
314
+ if generic_count > self._tool_search_threshold:
315
+ logger.debug(
316
+ "tool_search: %d generic tools > threshold %d, applying defer_loading",
317
+ generic_count,
318
+ self._tool_search_threshold,
319
+ )
320
+ effective_tools = [
321
+ {**t, "defer_loading": True}
322
+ if isinstance(t, dict) and "input_schema" in t
323
+ else t
324
+ for t in effective_tools
325
+ ]
326
+
245
327
  # Bedrock doesn't support .stream() - use create(stream=True) instead
246
328
  if isinstance(self.anthropic_client, AsyncAnthropicBedrock):
247
329
  try:
@@ -250,7 +332,7 @@ class ClaudeAgent(MCPAgent):
250
332
  system=self.system_prompt if self.system_prompt is not None else Omit(),
251
333
  max_tokens=self.max_tokens,
252
334
  messages=messages_cached,
253
- tools=self.claude_tools,
335
+ tools=effective_tools,
254
336
  tool_choice={"type": "auto", "disable_parallel_tool_use": True},
255
337
  betas=betas,
256
338
  )
@@ -271,7 +353,7 @@ class ClaudeAgent(MCPAgent):
271
353
  system=self.system_prompt if self.system_prompt is not None else Omit(),
272
354
  max_tokens=self.max_tokens,
273
355
  messages=messages_cached,
274
- tools=self.claude_tools,
356
+ tools=effective_tools,
275
357
  tool_choice={"type": "auto", "disable_parallel_tool_use": True},
276
358
  betas=betas,
277
359
  ) as stream:
@@ -315,34 +397,7 @@ class ClaudeAgent(MCPAgent):
315
397
  raise ValueError("Claude response missing after stream retries")
316
398
 
317
399
  # Process response
318
- result = AgentResponse(content="", tool_calls=[], done=True)
319
-
320
- # Extract text content and reasoning
321
- text_content = ""
322
- thinking_content = ""
323
-
324
- for block in response.content:
325
- if block.type == "tool_use":
326
- tool_call = MCPToolCall(
327
- id=block.id,
328
- # look up name in tool_mapping if available, otherwise use block name
329
- name=self.tool_mapping.get(block.name, block.name),
330
- arguments=block.input
331
- if isinstance(block.input, dict)
332
- else block.input.__dict__,
333
- )
334
- result.tool_calls.append(tool_call)
335
- result.done = False
336
- elif block.type == "text":
337
- text_content += block.text
338
- elif hasattr(block, "type") and block.type == "thinking":
339
- if thinking_content:
340
- thinking_content += "\n"
341
- thinking_content += block.thinking
342
-
343
- result.content = text_content
344
- if thinking_content:
345
- result.reasoning = thinking_content
400
+ result = self._result_from_response_blocks(list(response.content))
346
401
 
347
402
  return result
348
403
 
@@ -353,23 +408,28 @@ class ClaudeAgent(MCPAgent):
353
408
 
354
409
  Handles EmbeddedResource (PDFs), images, and text content.
355
410
  """
411
+ citations_enabled = bool(
412
+ getattr(self.ctx, "scenario_enable_citations", False) if self.ctx else False
413
+ )
414
+
356
415
  # Process each tool result
357
- user_content = []
416
+ user_content: list[BetaToolResultBlockParam | BetaRequestDocumentBlockParam] = []
358
417
 
359
418
  for tool_call, result in zip(tool_calls, tool_results, strict=True):
360
- # Extract Claude-specific metadata from extra fields
361
419
  tool_use_id = tool_call.id
362
420
  if not tool_use_id:
363
421
  self.hud_console.warning(f"No tool_use_id found for {tool_call.name}")
364
422
  continue
365
423
 
366
- # Convert MCP tool results to Claude format
424
+ # Blocks placed inside the tool_result (text, images)
367
425
  claude_blocks: list[
368
426
  BetaTextBlockParam | BetaImageBlockParam | BetaRequestDocumentBlockParam
369
427
  ] = []
428
+ # Citable document blocks placed as siblings after the tool_result
429
+ # so Claude's citation system indexes them properly.
430
+ sibling_docs: list[BetaRequestDocumentBlockParam] = []
370
431
 
371
432
  if result.isError:
372
- # Extract error message from content
373
433
  error_msg = "Tool execution failed"
374
434
  for content in result.content:
375
435
  if isinstance(content, types.TextContent):
@@ -377,27 +437,37 @@ class ClaudeAgent(MCPAgent):
377
437
  break
378
438
  claude_blocks.append(text_to_content_block(f"Error: {error_msg}"))
379
439
  else:
380
- # Process success content
381
440
  for content in result.content:
382
441
  if isinstance(content, types.TextContent):
383
442
  claude_blocks.append(text_to_content_block(content.text))
443
+ if citations_enabled:
444
+ sibling_docs.append(
445
+ text_document_block(content.text, title=tool_call.name)
446
+ )
384
447
  elif isinstance(content, types.ImageContent):
385
448
  claude_blocks.append(base64_to_content_block(content.data))
386
449
  elif isinstance(content, types.EmbeddedResource):
387
- # Handle embedded resources (PDFs)
388
450
  resource = content.resource
389
451
  if (
390
452
  isinstance(resource, types.BlobResourceContents)
391
453
  and resource.mimeType == "application/pdf"
392
454
  ):
393
455
  claude_blocks.append(
394
- document_to_content_block(base64_data=resource.blob)
456
+ document_to_content_block(
457
+ base64_data=resource.blob,
458
+ )
395
459
  )
460
+ if citations_enabled:
461
+ sibling_docs.append(
462
+ document_to_content_block(
463
+ base64_data=resource.blob,
464
+ enable_citations=True,
465
+ )
466
+ )
396
467
 
397
- # Add tool result
398
468
  user_content.append(tool_use_content_block(tool_use_id, claude_blocks))
469
+ user_content.extend(sibling_docs)
399
470
 
400
- # Return as a user message containing all tool results
401
471
  return [
402
472
  BetaMessageParam(
403
473
  role="user",
@@ -418,12 +488,28 @@ class ClaudeAgent(MCPAgent):
418
488
  self.tool_mapping: dict[str, str] = {}
419
489
  self.claude_tools: list[BetaToolUnionParam] = []
420
490
  self._required_betas: set[str] = set()
491
+ self._tool_search_threshold = None
492
+ self._gated_screenshot_tools: set[str] = set()
421
493
 
422
494
  categorized = self._categorized_tools
423
495
 
424
- # Log skipped hosted tools (Claude doesn't support hosted tools currently)
425
- for tool, _spec in categorized.hosted:
426
- logger.debug("Skipping hosted tool %s for Claude", tool.name)
496
+ # Process hosted tools
497
+ for tool, spec in categorized.hosted:
498
+ if not spec.api_type:
499
+ logger.debug("Skipping hosted tool %s: no api_type", tool.name)
500
+ continue
501
+ tool_def: dict[str, Any] = {
502
+ "type": spec.api_type,
503
+ "name": spec.api_name or tool.name,
504
+ }
505
+ api_extra = {k: v for k, v in spec.extra.items() if k != "threshold"}
506
+ tool_def.update(api_extra)
507
+ if spec.beta:
508
+ self._required_betas.add(spec.beta)
509
+ if "threshold" in spec.extra:
510
+ self._tool_search_threshold = spec.extra["threshold"]
511
+ self.claude_tools.append(tool_def) # type: ignore[arg-type]
512
+ logger.debug("Added hosted tool %s (%s) for Claude", tool.name, spec.api_type)
427
513
 
428
514
  # Process native tools
429
515
  for tool, spec in categorized.native:
@@ -437,6 +523,9 @@ class ClaudeAgent(MCPAgent):
437
523
 
438
524
  if spec.api_type and spec.api_type.startswith("computer"):
439
525
  self.has_computer_tool = True
526
+ if spec.api_type == "computer_20251124":
527
+ self._gated_screenshot_tools.add(tool.name)
528
+ logger.debug("Screenshot gating enabled for tool %s (computer_20251124)", tool.name)
440
529
 
441
530
  # Process generic tools
442
531
  for tool in categorized.generic:
@@ -611,9 +700,27 @@ def text_to_content_block(text: str) -> BetaTextBlockParam:
611
700
  return {"type": "text", "text": text}
612
701
 
613
702
 
614
- def document_to_content_block(base64_data: str) -> BetaRequestDocumentBlockParam:
703
+ def text_document_block(text: str, *, title: str | None = None) -> BetaRequestDocumentBlockParam:
704
+ """Wrap plain text as a citable document block."""
705
+ block = BetaRequestDocumentBlockParam(
706
+ type="document",
707
+ source=BetaPlainTextSourceParam(
708
+ type="text",
709
+ media_type="text/plain",
710
+ data=text,
711
+ ),
712
+ citations={"enabled": True},
713
+ )
714
+ if title:
715
+ block["title"] = title
716
+ return block
717
+
718
+
719
+ def document_to_content_block(
720
+ base64_data: str, *, enable_citations: bool = False
721
+ ) -> BetaRequestDocumentBlockParam:
615
722
  """Convert base64 PDF to Claude document content block."""
616
- return BetaRequestDocumentBlockParam(
723
+ block = BetaRequestDocumentBlockParam(
617
724
  type="document",
618
725
  source=BetaBase64PDFSourceParam(
619
726
  type="base64",
@@ -621,6 +728,9 @@ def document_to_content_block(base64_data: str) -> BetaRequestDocumentBlockParam
621
728
  data=base64_data,
622
729
  ),
623
730
  )
731
+ if enable_citations:
732
+ block["citations"] = {"enabled": True}
733
+ return block
624
734
 
625
735
 
626
736
  def tool_use_content_block(