hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -2,11 +2,11 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import TYPE_CHECKING, cast
5
+ from typing import TYPE_CHECKING, Any, cast
6
6
  from unittest.mock import AsyncMock, MagicMock, patch
7
7
 
8
8
  import pytest
9
- from anthropic import BadRequestError
9
+ from anthropic import AsyncAnthropic, AsyncAnthropicBedrock
10
10
  from mcp import types
11
11
 
12
12
  from hud.agents.claude import (
@@ -15,18 +15,96 @@ from hud.agents.claude import (
15
15
  text_to_content_block,
16
16
  tool_use_content_block,
17
17
  )
18
+ from hud.environment.router import ToolRouter
19
+ from hud.eval.context import EvalContext
18
20
  from hud.types import MCPToolCall, MCPToolResult
19
21
 
20
22
  if TYPE_CHECKING:
23
+ from collections.abc import Generator
24
+
21
25
  from anthropic.types.beta import BetaImageBlockParam, BetaMessageParam, BetaTextBlockParam
22
26
 
23
27
 
28
+ class MockEvalContext(EvalContext):
29
+ """Mock EvalContext for testing."""
30
+
31
+ def __init__(self, tools: list[types.Tool] | None = None) -> None:
32
+ # Core attributes
33
+ self.prompt = "Test prompt"
34
+ self._tools = tools or []
35
+ self._submitted: str | None = None
36
+ self.reward: float | None = None
37
+
38
+ # Environment attributes
39
+ self._router = ToolRouter()
40
+ self._agent_include: list[str] | None = None
41
+ self._agent_exclude: list[str] | None = None
42
+
43
+ # EvalContext attributes
44
+ self._task = None
45
+ self.trace_id = "test-trace-id"
46
+ self.eval_name = "test-eval"
47
+ self.job_id: str | None = None
48
+ self.group_id: str | None = None
49
+ self.index = 0
50
+ self.variants: dict[str, Any] = {}
51
+ self.answer: str | None = None
52
+ self.system_prompt: str | None = None
53
+ self.error: BaseException | None = None
54
+ self.metadata: dict[str, Any] = {}
55
+ self.results: list[Any] = []
56
+ self._is_summary = False
57
+
58
+ def as_tools(self) -> list[types.Tool]:
59
+ return self._tools
60
+
61
+ @property
62
+ def has_scenario(self) -> bool:
63
+ return False
64
+
65
+ async def list_tools(self) -> list[types.Tool]:
66
+ return self._tools
67
+
68
+ async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
69
+ return MCPToolResult(
70
+ content=[types.TextContent(type="text", text="ok")],
71
+ isError=False,
72
+ )
73
+
74
+ async def submit(self, answer: str) -> None:
75
+ self._submitted = answer
76
+
77
+
78
+ class MockStreamContextManager:
79
+ """Mock for Claude's streaming context manager."""
80
+
81
+ def __init__(self, response: MagicMock) -> None:
82
+ self.response = response
83
+
84
+ async def __aenter__(self) -> MockStreamContextManager:
85
+ return self
86
+
87
+ async def __aexit__(
88
+ self, exc_type: type | None, exc_val: Exception | None, exc_tb: Any
89
+ ) -> bool:
90
+ return False
91
+
92
+ def __aiter__(self) -> MockStreamContextManager:
93
+ return self
94
+
95
+ async def __anext__(self) -> None:
96
+ raise StopAsyncIteration
97
+
98
+ async def get_final_message(self) -> MagicMock:
99
+ return self.response
100
+
101
+
24
102
  class TestClaudeHelperFunctions:
25
103
  """Test helper functions for Claude message formatting."""
26
104
 
27
- def test_base64_to_content_block(self):
105
+ def test_base64_to_content_block(self) -> None:
28
106
  """Test base64 image conversion."""
29
- base64_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==" # noqa: E501
107
+ base64_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk"
30
108
  result = base64_to_content_block(base64_data)
31
109
 
32
110
  assert result["type"] == "image"
@@ -34,7 +112,7 @@ class TestClaudeHelperFunctions:
34
112
  assert result["source"]["media_type"] == "image/png"
35
113
  assert result["source"]["data"] == base64_data
36
114
 
37
- def test_text_to_content_block(self):
115
+ def test_text_to_content_block(self) -> None:
38
116
  """Test text conversion."""
39
117
  text = "Hello, world!"
40
118
  result = text_to_content_block(text)
@@ -42,7 +120,7 @@ class TestClaudeHelperFunctions:
42
120
  assert result["type"] == "text"
43
121
  assert result["text"] == text
44
122
 
45
- def test_tool_use_content_block(self):
123
+ def test_tool_use_content_block(self) -> None:
46
124
  """Test tool result content block creation."""
47
125
  tool_use_id = "tool_123"
48
126
  content: list[BetaTextBlockParam | BetaImageBlockParam] = [
@@ -60,192 +138,331 @@ class TestClaudeAgent:
60
138
  """Test ClaudeAgent class."""
61
139
 
62
140
  @pytest.fixture
63
- def mock_mcp_client(self):
64
- """Create a mock MCP client."""
65
- mcp_client = MagicMock()
66
- return mcp_client
67
-
68
- @pytest.fixture
69
- def mock_anthropic(self):
70
- """Create a mock Anthropic client."""
71
- with patch("hud.agents.claude.AsyncAnthropic") as mock:
72
- client = AsyncMock()
73
- # Add beta attribute with messages
74
- client.beta = AsyncMock()
75
- client.beta.messages = AsyncMock()
76
- mock.return_value = client
77
- yield client
141
+ def mock_anthropic(self) -> Generator[AsyncAnthropic, None, None]: # type: ignore[misc]
142
+ """Create a stub Anthropic client."""
143
+ with patch("hud.agents.claude.AsyncAnthropic") as mock_class:
144
+ client = MagicMock(spec=AsyncAnthropic)
145
+ client.api_key = "test-key"
146
+ mock_class.return_value = client
147
+ yield client # type: ignore[misc]
78
148
 
79
149
  @pytest.mark.asyncio
80
- async def test_init(self, mock_mcp_client, mock_anthropic):
81
- """Test agent initialization."""
82
- # Test with provided model_client
83
- mock_model_client = MagicMock()
84
- agent = ClaudeAgent(
85
- mcp_client=mock_mcp_client,
86
- model_client=mock_model_client,
87
- model="claude-3-opus-20240229",
88
- max_tokens=1000,
89
- validate_api_key=False, # Skip validation in tests
150
+ async def test_init_with_client(self, mock_anthropic: AsyncAnthropic) -> None:
151
+ """Test agent initialization with provided client."""
152
+ agent = ClaudeAgent.create(
153
+ model_client=mock_anthropic,
154
+ model="claude-sonnet-4-20250514",
155
+ validate_api_key=False,
90
156
  )
91
157
 
92
- assert agent.model_name == "claude-3-opus-20240229"
93
- assert agent.max_tokens == 1000
94
- assert agent.anthropic_client == mock_model_client
158
+ assert agent.model_name == "Claude"
159
+ assert agent.config.model == "claude-sonnet-4-20250514"
160
+ assert agent.anthropic_client == mock_anthropic
95
161
 
96
162
  @pytest.mark.asyncio
97
- async def test_init_without_model_client(self, mock_mcp_client, mock_anthropic):
98
- """Test agent initialization without model client."""
99
- with patch("hud.settings.settings.anthropic_api_key", "test_key"):
100
- agent = ClaudeAgent(
101
- mcp_client=mock_mcp_client,
102
- model="claude-3-opus-20240229",
103
- validate_api_key=False, # Skip validation in tests
104
- )
163
+ async def test_init_with_parameters(self, mock_anthropic: AsyncAnthropic) -> None:
164
+ """Test agent initialization with various parameters."""
165
+ agent = ClaudeAgent.create(
166
+ model_client=mock_anthropic,
167
+ model="claude-sonnet-4-20250514",
168
+ max_tokens=4096,
169
+ validate_api_key=False,
170
+ )
105
171
 
106
- assert agent.model_name == "claude-3-opus-20240229"
107
- assert agent.anthropic_client is not None
172
+ assert agent.max_tokens == 4096
108
173
 
109
174
  @pytest.mark.asyncio
110
- async def test_format_blocks(self, mock_mcp_client):
111
- """Test formatting content blocks into Claude messages."""
112
- mock_model_client = MagicMock()
113
- agent = ClaudeAgent(
114
- mcp_client=mock_mcp_client,
115
- model_client=mock_model_client,
116
- validate_api_key=False, # Skip validation in tests
175
+ async def test_format_blocks_text_only(self, mock_anthropic: AsyncAnthropic) -> None:
176
+ """Test formatting text content blocks."""
177
+ agent = ClaudeAgent.create(
178
+ model_client=mock_anthropic,
179
+ validate_api_key=False,
117
180
  )
118
181
 
119
- # Test with text only
120
- text_blocks: list[types.ContentBlock] = [
121
- types.TextContent(type="text", text="Hello, Claude!")
182
+ blocks: list[types.ContentBlock] = [
183
+ types.TextContent(type="text", text="Hello, world!"),
184
+ types.TextContent(type="text", text="How are you?"),
122
185
  ]
123
- messages = await agent.format_blocks(text_blocks)
186
+
187
+ messages = await agent.format_blocks(blocks)
124
188
  assert len(messages) == 1
125
189
  assert messages[0]["role"] == "user"
126
190
  content = messages[0]["content"]
127
191
  assert isinstance(content, list)
128
- assert len(content) == 1
129
- assert content[0]["type"] == "text"
130
- assert content[0]["text"] == "Hello, Claude!"
192
+ assert len(content) == 2
193
+ assert content[0]["type"] == "text" # type: ignore[index]
194
+ assert content[0]["text"] == "Hello, world!" # type: ignore[index]
131
195
 
132
- # Test with screenshot
133
- image_blocks: list[types.ContentBlock] = [
134
- types.TextContent(type="text", text="Look at this"),
196
+ @pytest.mark.asyncio
197
+ async def test_format_blocks_with_image(self, mock_anthropic: AsyncAnthropic) -> None:
198
+ """Test formatting image content blocks."""
199
+ agent = ClaudeAgent.create(
200
+ model_client=mock_anthropic,
201
+ validate_api_key=False,
202
+ )
203
+
204
+ blocks: list[types.ContentBlock] = [
205
+ types.TextContent(type="text", text="Look at this:"),
135
206
  types.ImageContent(type="image", data="base64data", mimeType="image/png"),
136
207
  ]
137
- messages = await agent.format_blocks(image_blocks)
208
+
209
+ messages = await agent.format_blocks(blocks)
138
210
  assert len(messages) == 1
139
- assert messages[0]["role"] == "user"
140
211
  content = messages[0]["content"]
141
212
  assert isinstance(content, list)
142
213
  assert len(content) == 2
143
- # Content blocks are in order
144
- assert content[0]["type"] == "text"
145
- assert content[0]["text"] == "Look at this"
146
- assert content[1]["type"] == "image"
147
- assert content[1]["source"]["data"] == "base64data"
214
+ assert content[1]["type"] == "image" # type: ignore[index]
148
215
 
149
216
  @pytest.mark.asyncio
150
- async def test_format_tool_results_method(self, mock_mcp_client):
151
- """Test the agent's format_tool_results method."""
152
- mock_model_client = MagicMock()
153
- agent = ClaudeAgent(
154
- mcp_client=mock_mcp_client,
155
- model_client=mock_model_client,
156
- validate_api_key=False, # Skip validation in tests
217
+ async def test_format_tool_results_text(self, mock_anthropic: AsyncAnthropic) -> None:
218
+ """Test formatting tool results with text content."""
219
+ agent = ClaudeAgent.create(
220
+ model_client=mock_anthropic,
221
+ validate_api_key=False,
157
222
  )
158
223
 
159
- tool_calls = [
160
- MCPToolCall(name="test_tool", arguments={}, id="id1"),
161
- ]
162
-
224
+ tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
163
225
  tool_results = [
164
- MCPToolResult(content=[types.TextContent(type="text", text="Success")], isError=False),
226
+ MCPToolResult(
227
+ content=[types.TextContent(type="text", text="Tool output")],
228
+ isError=False,
229
+ )
165
230
  ]
166
231
 
167
232
  messages = await agent.format_tool_results(tool_calls, tool_results)
168
-
169
- # format_tool_results returns a single user message with tool result content
170
233
  assert len(messages) == 1
171
234
  assert messages[0]["role"] == "user"
172
- # The content is wrapped in a tool result block
173
- content = list(messages[0]["content"])
235
+ content = messages[0]["content"]
236
+ assert isinstance(content, list)
174
237
  assert len(content) == 1
175
- assert content[0]["type"] == "tool_result" # type: ignore
176
- assert content[0]["tool_use_id"] == "id1" # type: ignore
177
- # The actual content is nested inside
178
- inner_content = list(content[0]["content"]) # type: ignore
179
- assert inner_content[0]["type"] == "text" # type: ignore
180
- assert inner_content[0]["text"] == "Success" # type: ignore
238
+ assert content[0]["type"] == "tool_result" # type: ignore[index]
239
+ assert content[0]["tool_use_id"] == "call_123" # type: ignore[index]
240
+
241
+ @pytest.mark.asyncio
242
+ async def test_format_tool_results_with_error(self, mock_anthropic: AsyncAnthropic) -> None:
243
+ """Test formatting tool results with error."""
244
+ agent = ClaudeAgent.create(
245
+ model_client=mock_anthropic,
246
+ validate_api_key=False,
247
+ )
248
+
249
+ tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
250
+ tool_results = [
251
+ MCPToolResult(
252
+ content=[types.TextContent(type="text", text="Error message")],
253
+ isError=True,
254
+ )
255
+ ]
256
+
257
+ messages = await agent.format_tool_results(tool_calls, tool_results)
258
+ assert len(messages) == 1
259
+ content = messages[0]["content"]
260
+ # Error content should include "Error:" prefix
261
+ assert any("Error" in str(block) for block in content[0]["content"]) # type: ignore[index]
262
+
263
+ @pytest.mark.asyncio
264
+ async def test_get_system_messages(self, mock_anthropic: AsyncAnthropic) -> None:
265
+ """Test that system messages return empty (Claude uses system param)."""
266
+ agent = ClaudeAgent.create(
267
+ model_client=mock_anthropic,
268
+ system_prompt="You are a helpful assistant.",
269
+ validate_api_key=False,
270
+ )
271
+
272
+ messages = await agent.get_system_messages()
273
+ # Claude doesn't use system messages in the message list
274
+ assert messages == []
181
275
 
182
276
  @pytest.mark.asyncio
183
- async def test_get_response(self, mock_mcp_client, mock_anthropic):
184
- """Test getting model response from Claude API."""
185
- # Disable telemetry for this test to avoid backend configuration issues
277
+ async def test_get_response_with_thinking(self, mock_anthropic: AsyncAnthropic) -> None:
278
+ """Test getting model response with thinking content."""
186
279
  with patch("hud.settings.settings.telemetry_enabled", False):
187
- agent = ClaudeAgent(
188
- mcp_client=mock_mcp_client,
280
+ agent = ClaudeAgent.create(
189
281
  model_client=mock_anthropic,
190
- validate_api_key=False, # Skip validation in tests
282
+ validate_api_key=False,
191
283
  )
284
+ # Set up agent as initialized
285
+ agent.claude_tools = []
286
+ agent.tool_mapping = {}
287
+ agent.has_computer_tool = False
288
+ agent._initialized = True
192
289
 
193
- # Mock the API response
194
290
  mock_response = MagicMock()
195
291
 
196
- # Create text block
292
+ thinking_block = MagicMock()
293
+ thinking_block.type = "thinking"
294
+ thinking_block.thinking = "Let me analyze this problem..."
295
+
197
296
  text_block = MagicMock()
198
297
  text_block.type = "text"
199
- text_block.text = "Hello!"
298
+ text_block.text = "Here is the answer"
200
299
 
201
- # Create tool use block
202
- tool_block = MagicMock()
203
- tool_block.type = "tool_use"
204
- tool_block.id = "tool_123"
205
- tool_block.name = "test_tool"
206
- tool_block.input = {"param": "value"}
300
+ mock_response.content = [thinking_block, text_block]
301
+ mock_response.usage = MagicMock(input_tokens=10, output_tokens=30)
207
302
 
208
- mock_response.content = [text_block, tool_block]
209
- mock_response.usage = MagicMock(input_tokens=10, output_tokens=20)
210
- mock_anthropic.beta.messages.create = AsyncMock(return_value=mock_response)
303
+ mock_stream = MockStreamContextManager(mock_response)
304
+ mock_anthropic.beta.messages.stream = MagicMock(return_value=mock_stream)
211
305
 
212
306
  messages = [
213
307
  cast(
214
308
  "BetaMessageParam",
215
- {"role": "user", "content": [{"type": "text", "text": "Hi"}]},
309
+ {"role": "user", "content": [{"type": "text", "text": "Hard question"}]},
216
310
  )
217
311
  ]
218
312
  response = await agent.get_response(messages)
219
313
 
220
- assert response.content == "Hello!"
221
- assert len(response.tool_calls) == 1
222
- assert response.tool_calls[0].name == "test_tool"
223
- assert response.tool_calls[0].arguments == {"param": "value"}
224
- # The test was checking for Claude-specific attributes that aren't part of ModelResponse
225
- # These would need to be accessed from the original Claude response if needed
314
+ assert response.content == "Here is the answer"
315
+ assert response.reasoning == "Let me analyze this problem..."
316
+
317
+ @pytest.mark.asyncio
318
+ async def test_convert_tools_for_claude(self, mock_anthropic: AsyncAnthropic) -> None:
319
+ """Test converting MCP tools to Claude format."""
320
+ tools = [
321
+ types.Tool(
322
+ name="my_tool",
323
+ description="A test tool",
324
+ inputSchema={"type": "object", "properties": {"x": {"type": "string"}}},
325
+ )
326
+ ]
327
+ ctx = MockEvalContext(tools=tools)
328
+ agent = ClaudeAgent.create(
329
+ model_client=mock_anthropic,
330
+ validate_api_key=False,
331
+ )
332
+
333
+ agent.ctx = ctx
334
+ await agent._initialize_from_ctx(ctx)
226
335
 
227
- # Verify API was called correctly
228
- mock_anthropic.beta.messages.create.assert_called_once()
336
+ # Check that tools were converted
337
+ assert len(agent.claude_tools) == 1
338
+ assert agent.claude_tools[0]["name"] == "my_tool" # type: ignore[typeddict-item]
229
339
 
230
340
  @pytest.mark.asyncio
231
- async def test_get_model_response_text_only(self, mock_mcp_client, mock_anthropic):
232
- """Test getting text-only response."""
233
- # Disable telemetry for this test to avoid backend configuration issues
341
+ async def test_computer_tool_detection(self, mock_anthropic: AsyncAnthropic) -> None:
342
+ """Test that computer tools are detected for beta API."""
343
+ tools = [
344
+ types.Tool(
345
+ name="computer",
346
+ description="Control computer",
347
+ inputSchema={"type": "object"},
348
+ )
349
+ ]
350
+ ctx = MockEvalContext(tools=tools)
351
+ agent = ClaudeAgent.create(
352
+ model_client=mock_anthropic,
353
+ validate_api_key=False,
354
+ )
355
+
356
+ agent.ctx = ctx
357
+ await agent._initialize_from_ctx(ctx)
358
+
359
+ assert agent.has_computer_tool is True
360
+
361
+ @pytest.mark.asyncio
362
+ async def test_get_response_with_text(self, mock_anthropic: AsyncAnthropic) -> None:
363
+ """Test getting response with text output."""
364
+ # Create mock response
365
+ mock_response = MagicMock()
366
+ mock_response.content = [MagicMock(type="text", text="Hello!")]
367
+
368
+ mock_stream = MockStreamContextManager(mock_response)
369
+ mock_anthropic.beta.messages.stream = MagicMock(return_value=mock_stream)
370
+
371
+ agent = ClaudeAgent.create(
372
+ model_client=mock_anthropic,
373
+ validate_api_key=False,
374
+ )
375
+ agent.claude_tools = []
376
+ agent.tool_mapping = {}
377
+ agent.has_computer_tool = False
378
+ agent._initialized = True
379
+
380
+ response = await agent.get_response([])
381
+ assert response.content == "Hello!"
382
+ assert response.done is True
383
+ assert len(response.tool_calls) == 0
384
+
385
+ @pytest.mark.asyncio
386
+ async def test_get_response_with_tool_call(self, mock_anthropic: AsyncAnthropic) -> None:
387
+ """Test getting response with tool call."""
388
+ mock_tool_use = MagicMock()
389
+ mock_tool_use.type = "tool_use"
390
+ mock_tool_use.id = "call_123"
391
+ mock_tool_use.name = "my_tool"
392
+ mock_tool_use.input = {"x": "value"}
393
+
394
+ mock_response = MagicMock()
395
+ mock_response.content = [mock_tool_use]
396
+
397
+ mock_stream = MockStreamContextManager(mock_response)
398
+ mock_anthropic.beta.messages.stream = MagicMock(return_value=mock_stream)
399
+
400
+ agent = ClaudeAgent.create(
401
+ model_client=mock_anthropic,
402
+ validate_api_key=False,
403
+ )
404
+ agent.claude_tools = []
405
+ agent.tool_mapping = {"my_tool": "my_tool"}
406
+ agent.has_computer_tool = False
407
+ agent._initialized = True
408
+
409
+ response = await agent.get_response([])
410
+ assert response.done is False
411
+ assert len(response.tool_calls) == 1
412
+ assert response.tool_calls[0].name == "my_tool"
413
+ assert response.tool_calls[0].arguments == {"x": "value"}
414
+
415
+
416
+ class TestClaudeAgentBedrock:
417
+ """Test ClaudeAgent class with Bedrock."""
418
+
419
+ @pytest.fixture
420
+ def bedrock_client(self) -> AsyncAnthropicBedrock:
421
+ """Create a real AsyncAnthropicBedrock client and stub networked methods."""
422
+ client = AsyncAnthropicBedrock(
423
+ aws_access_key="AKIATEST",
424
+ aws_secret_key="secret",
425
+ aws_region="us-east-1",
426
+ )
427
+ # Stub the actual Bedrock call so tests are hermetic.
428
+ client.beta.messages.create = AsyncMock()
429
+ return client
430
+
431
+ @pytest.mark.asyncio
432
+ async def test_init(self, bedrock_client: AsyncAnthropicBedrock) -> None:
433
+ """Test agent initialization."""
434
+ agent = ClaudeAgent.create(
435
+ model_client=bedrock_client,
436
+ model="test-model-arn",
437
+ validate_api_key=False,
438
+ )
439
+
440
+ assert agent.model_name == "Claude"
441
+ assert agent.config.model == "test-model-arn"
442
+ assert agent.anthropic_client == bedrock_client
443
+
444
+ @pytest.mark.asyncio
445
+ async def test_get_response_bedrock_uses_create_not_stream(
446
+ self, bedrock_client: AsyncAnthropicBedrock
447
+ ) -> None:
448
+ """Bedrock path must call messages.create() (Bedrock doesn't support stream())."""
234
449
  with patch("hud.settings.settings.telemetry_enabled", False):
235
- agent = ClaudeAgent(
236
- mcp_client=mock_mcp_client,
237
- model_client=mock_anthropic,
238
- validate_api_key=False, # Skip validation in tests
450
+ agent = ClaudeAgent.create(
451
+ model_client=bedrock_client,
452
+ model="test-model-arn",
453
+ validate_api_key=False,
239
454
  )
240
455
 
456
+ # Enable computer tool to verify betas list includes computer-use in Bedrock mode.
457
+ agent.has_computer_tool = True
458
+
241
459
  mock_response = MagicMock()
242
- # Create text block
243
460
  text_block = MagicMock()
244
461
  text_block.type = "text"
245
- text_block.text = "Just text"
462
+ text_block.text = "Hello from Bedrock"
246
463
  mock_response.content = [text_block]
247
- mock_response.usage = MagicMock(input_tokens=5, output_tokens=10)
248
- mock_anthropic.beta.messages.create = AsyncMock(return_value=mock_response)
464
+
465
+ bedrock_client.beta.messages.create.return_value = mock_response # type: ignore[union-attr]
249
466
 
250
467
  messages = [
251
468
  cast(
@@ -255,95 +472,47 @@ class TestClaudeAgent:
255
472
  ]
256
473
  response = await agent.get_response(messages)
257
474
 
258
- assert response.content == "Just text"
475
+ assert response.content == "Hello from Bedrock"
259
476
  assert response.tool_calls == []
260
477
 
478
+ # Bedrock-specific behavior: uses create() and appends assistant message directly.
479
+ assert not hasattr(bedrock_client.beta.messages, "stream")
480
+ bedrock_client.beta.messages.create.assert_awaited_once() # type: ignore[union-attr]
481
+ assert len(messages) == 2
482
+ assert messages[-1]["role"] == "assistant"
483
+
484
+ # Ensure the Bedrock call shape is stable.
485
+ _, kwargs = bedrock_client.beta.messages.create.call_args # type: ignore[union-attr]
486
+ assert kwargs["model"] == "test-model-arn"
487
+ assert kwargs["tool_choice"] == {"type": "auto", "disable_parallel_tool_use": True}
488
+ assert "fine-grained-tool-streaming-2025-05-14" in kwargs["betas"]
489
+ assert "computer-use-2025-01-24" in kwargs["betas"]
490
+
261
491
  @pytest.mark.asyncio
262
- async def test_get_model_response_error(self, mock_mcp_client, mock_anthropic):
263
- """Test handling API errors."""
264
- # Disable telemetry for this test to avoid backend configuration issues
492
+ async def test_get_response_bedrock_missing_boto3_raises_value_error(
493
+ self, bedrock_client: AsyncAnthropicBedrock
494
+ ) -> None:
495
+ """If boto3 isn't installed, Bedrock client import path should raise a clear ValueError."""
265
496
  with patch("hud.settings.settings.telemetry_enabled", False):
266
- agent = ClaudeAgent(
267
- mcp_client=mock_mcp_client,
268
- model_client=mock_anthropic,
269
- validate_api_key=False, # Skip validation in tests
270
- )
271
-
272
- # Mock API error
273
- mock_anthropic.beta.messages.create = AsyncMock(
274
- side_effect=BadRequestError(
275
- message="Invalid request",
276
- response=MagicMock(status_code=400),
277
- body={"error": {"message": "Invalid request"}},
278
- )
497
+ agent = ClaudeAgent.create(
498
+ model_client=bedrock_client,
499
+ model="test-model-arn",
500
+ validate_api_key=False,
279
501
  )
280
502
 
503
+ bedrock_client.beta.messages.create.side_effect = ModuleNotFoundError("boto3") # type: ignore[union-attr]
281
504
  messages = [{"role": "user", "content": [{"type": "text", "text": "Hi"}]}]
282
505
 
283
- with pytest.raises(BadRequestError):
506
+ with pytest.raises(ValueError, match=r"boto3 is required for AWS Bedrock"):
284
507
  await agent.get_response(messages) # type: ignore
285
508
 
286
- # This test is commented out as it's testing complex integration scenarios
287
- # that may have changed in the implementation
288
- # @pytest.mark.asyncio
289
- # async def test_run_with_tools(self, mock_mcp_client, mock_anthropic):
290
- # """Test running agent with tool usage."""
291
- # # Disable telemetry for this test to avoid backend configuration issues
292
- # with patch("hud.settings.settings.telemetry_enabled", False):
293
- # agent = ClaudeAgent(mcp_client=mock_mcp_client, model_client=mock_anthropic)
294
-
295
- # # Mock tool availability
296
- # agent._available_tools = [
297
- # types.Tool(
298
- # name="calculator", description="Calculator", inputSchema={"type": "object"}
299
- # )
300
- # ]
301
- # agent._tool_map = {
302
- # "calculator": types.Tool(
303
- # name="calculator", description="Calculator", inputSchema={"type": "object"}
304
- # )
305
- # }
306
-
307
- # # Mock initial response with tool use
308
- # initial_response = MagicMock()
309
- # # Create tool use block
310
- # tool_block = MagicMock()
311
- # tool_block.type = "tool_use"
312
- # tool_block.id = "calc_123"
313
- # tool_block.name = "calculator"
314
- # tool_block.input = {"operation": "add", "a": 2, "b": 3}
315
- # initial_response.content = [tool_block]
316
- # initial_response.usage = MagicMock(input_tokens=10, output_tokens=15)
317
-
318
- # # Mock follow-up response
319
- # final_response = MagicMock()
320
- # text_block = MagicMock()
321
- # text_block.type = "text"
322
- # text_block.text = "2 + 3 = 5"
323
- # final_response.content = [text_block]
324
- # final_response.usage = MagicMock(input_tokens=20, output_tokens=10)
325
-
326
- # mock_anthropic.beta.messages.create = AsyncMock(
327
- # side_effect=[initial_response, final_response]
328
- # )
329
-
330
- # # Mock tool execution
331
- # mock_mcp_client.call_tool = AsyncMock(
332
- # return_value=MCPToolResult(
333
- # content=[types.TextContent(type="text", text="5")], isError=False
334
- # )
335
- # )
336
-
337
- # # Mock the mcp_client properties
338
- # mock_mcp_client.mcp_config = {"test_server": {"url": "http://localhost"}}
339
- # mock_mcp_client.list_tools = AsyncMock(return_value=agent._available_tools)
340
- # mock_mcp_client.initialize = AsyncMock()
341
-
342
- # # Initialize the agent
343
- # await agent.initialize()
344
-
345
- # # Use a string prompt instead of a task
346
- # result = await agent.run("What is 2 + 3?")
347
-
348
- # assert result.content == "2 + 3 = 5"
349
- # assert result.done is True
509
+ def test_init_with_bedrock_client_does_not_require_anthropic_api_key(
510
+ self, bedrock_client: AsyncAnthropicBedrock
511
+ ) -> None:
512
+ """Providing model_client should bypass ANTHROPIC_API_KEY validation."""
513
+ with patch("hud.settings.settings.anthropic_api_key", None):
514
+ agent = ClaudeAgent.create(
515
+ model_client=bedrock_client,
516
+ validate_api_key=False,
517
+ )
518
+ assert agent.anthropic_client == bedrock_client