hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
@@ -2,11 +2,11 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import TYPE_CHECKING, cast
5
+ from typing import TYPE_CHECKING, Any, cast
6
6
  from unittest.mock import AsyncMock, MagicMock, patch
7
7
 
8
8
  import pytest
9
- from anthropic import BadRequestError
9
+ from anthropic import AsyncAnthropic, AsyncAnthropicBedrock
10
10
  from mcp import types
11
11
 
12
12
  from hud.agents.claude import (
@@ -15,18 +15,96 @@ from hud.agents.claude import (
15
15
  text_to_content_block,
16
16
  tool_use_content_block,
17
17
  )
18
+ from hud.environment.router import ToolRouter
19
+ from hud.eval.context import EvalContext
18
20
  from hud.types import MCPToolCall, MCPToolResult
19
21
 
20
22
  if TYPE_CHECKING:
21
- from anthropic.types.beta import BetaImageBlockParam, BetaMessageParam, BetaTextBlockParam
23
+ from collections.abc import Generator
24
+
25
+ from anthropic.types.beta import BetaMessageParam
26
+
27
+
28
+ class MockEvalContext(EvalContext):
29
+ """Mock EvalContext for testing."""
30
+
31
+ def __init__(self, tools: list[types.Tool] | None = None) -> None:
32
+ # Core attributes
33
+ self.prompt = "Test prompt"
34
+ self._tools = tools or []
35
+ self._submitted: str | None = None
36
+ self.reward: float | None = None
37
+
38
+ # Environment attributes
39
+ self._router = ToolRouter()
40
+ self._agent_include: list[str] | None = None
41
+ self._agent_exclude: list[str] | None = None
42
+
43
+ # EvalContext attributes
44
+ self._task = None
45
+ self.trace_id = "test-trace-id"
46
+ self.eval_name = "test-eval"
47
+ self.job_id: str | None = None
48
+ self.group_id: str | None = None
49
+ self.index = 0
50
+ self.variants: dict[str, Any] = {}
51
+ self.answer: str | None = None
52
+ self.system_prompt: str | None = None
53
+ self.error: BaseException | None = None
54
+ self.metadata: dict[str, Any] = {}
55
+ self.results: list[Any] = []
56
+ self._is_summary = False
57
+
58
+ def as_tools(self) -> list[types.Tool]:
59
+ return self._tools
60
+
61
+ @property
62
+ def has_scenario(self) -> bool:
63
+ return False
64
+
65
+ async def list_tools(self) -> list[types.Tool]:
66
+ return self._tools
67
+
68
+ async def call_tool(self, call: Any, /, **kwargs: Any) -> MCPToolResult:
69
+ return MCPToolResult(
70
+ content=[types.TextContent(type="text", text="ok")],
71
+ isError=False,
72
+ )
73
+
74
+ async def submit(self, answer: str) -> None:
75
+ self._submitted = answer
76
+
77
+
78
+ class MockStreamContextManager:
79
+ """Mock for Claude's streaming context manager."""
80
+
81
+ def __init__(self, response: MagicMock) -> None:
82
+ self.response = response
83
+
84
+ async def __aenter__(self) -> MockStreamContextManager:
85
+ return self
86
+
87
+ async def __aexit__(
88
+ self, exc_type: type | None, exc_val: Exception | None, exc_tb: Any
89
+ ) -> bool:
90
+ return False
91
+
92
+ def __aiter__(self) -> MockStreamContextManager:
93
+ return self
94
+
95
+ async def __anext__(self) -> None:
96
+ raise StopAsyncIteration
97
+
98
+ async def get_final_message(self) -> MagicMock:
99
+ return self.response
22
100
 
23
101
 
24
102
  class TestClaudeHelperFunctions:
25
103
  """Test helper functions for Claude message formatting."""
26
104
 
27
- def test_base64_to_content_block(self):
105
+ def test_base64_to_content_block(self) -> None:
28
106
  """Test base64 image conversion."""
29
- base64_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==" # noqa: E501
107
+ base64_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk"
30
108
  result = base64_to_content_block(base64_data)
31
109
 
32
110
  assert result["type"] == "image"
@@ -34,7 +112,7 @@ class TestClaudeHelperFunctions:
34
112
  assert result["source"]["media_type"] == "image/png"
35
113
  assert result["source"]["data"] == base64_data
36
114
 
37
- def test_text_to_content_block(self):
115
+ def test_text_to_content_block(self) -> None:
38
116
  """Test text conversion."""
39
117
  text = "Hello, world!"
40
118
  result = text_to_content_block(text)
@@ -42,12 +120,10 @@ class TestClaudeHelperFunctions:
42
120
  assert result["type"] == "text"
43
121
  assert result["text"] == text
44
122
 
45
- def test_tool_use_content_block(self):
123
+ def test_tool_use_content_block(self) -> None:
46
124
  """Test tool result content block creation."""
47
125
  tool_use_id = "tool_123"
48
- content: list[BetaTextBlockParam | BetaImageBlockParam] = [
49
- text_to_content_block("Result text")
50
- ]
126
+ content = [text_to_content_block("Result text")]
51
127
 
52
128
  result = tool_use_content_block(tool_use_id, content)
53
129
 
@@ -60,192 +136,331 @@ class TestClaudeAgent:
60
136
  """Test ClaudeAgent class."""
61
137
 
62
138
  @pytest.fixture
63
- def mock_mcp_client(self):
64
- """Create a mock MCP client."""
65
- mcp_client = MagicMock()
66
- return mcp_client
67
-
68
- @pytest.fixture
69
- def mock_anthropic(self):
70
- """Create a mock Anthropic client."""
71
- with patch("hud.agents.claude.AsyncAnthropic") as mock:
72
- client = AsyncMock()
73
- # Add beta attribute with messages
74
- client.beta = AsyncMock()
75
- client.beta.messages = AsyncMock()
76
- mock.return_value = client
77
- yield client
139
+ def mock_anthropic(self) -> Generator[AsyncAnthropic, None, None]: # type: ignore[misc]
140
+ """Create a stub Anthropic client."""
141
+ with patch("hud.agents.claude.AsyncAnthropic") as mock_class:
142
+ client = MagicMock(spec=AsyncAnthropic)
143
+ client.api_key = "test-key"
144
+ mock_class.return_value = client
145
+ yield client # type: ignore[misc]
78
146
 
79
147
  @pytest.mark.asyncio
80
- async def test_init(self, mock_mcp_client, mock_anthropic):
81
- """Test agent initialization."""
82
- # Test with provided model_client
83
- mock_model_client = MagicMock()
84
- agent = ClaudeAgent(
85
- mcp_client=mock_mcp_client,
86
- model_client=mock_model_client,
87
- model="claude-3-opus-20240229",
88
- max_tokens=1000,
89
- validate_api_key=False, # Skip validation in tests
148
+ async def test_init_with_client(self, mock_anthropic: AsyncAnthropic) -> None:
149
+ """Test agent initialization with provided client."""
150
+ agent = ClaudeAgent.create(
151
+ model_client=mock_anthropic,
152
+ model="claude-sonnet-4-20250514",
153
+ validate_api_key=False,
90
154
  )
91
155
 
92
- assert agent.model_name == "claude-3-opus-20240229"
93
- assert agent.max_tokens == 1000
94
- assert agent.anthropic_client == mock_model_client
156
+ assert agent.model_name == "Claude"
157
+ assert agent.config.model == "claude-sonnet-4-20250514"
158
+ assert agent.anthropic_client == mock_anthropic
95
159
 
96
160
  @pytest.mark.asyncio
97
- async def test_init_without_model_client(self, mock_mcp_client, mock_anthropic):
98
- """Test agent initialization without model client."""
99
- with patch("hud.settings.settings.anthropic_api_key", "test_key"):
100
- agent = ClaudeAgent(
101
- mcp_client=mock_mcp_client,
102
- model="claude-3-opus-20240229",
103
- validate_api_key=False, # Skip validation in tests
104
- )
161
+ async def test_init_with_parameters(self, mock_anthropic: AsyncAnthropic) -> None:
162
+ """Test agent initialization with various parameters."""
163
+ agent = ClaudeAgent.create(
164
+ model_client=mock_anthropic,
165
+ model="claude-sonnet-4-20250514",
166
+ max_tokens=4096,
167
+ validate_api_key=False,
168
+ )
105
169
 
106
- assert agent.model_name == "claude-3-opus-20240229"
107
- assert agent.anthropic_client is not None
170
+ assert agent.max_tokens == 4096
108
171
 
109
172
  @pytest.mark.asyncio
110
- async def test_format_blocks(self, mock_mcp_client):
111
- """Test formatting content blocks into Claude messages."""
112
- mock_model_client = MagicMock()
113
- agent = ClaudeAgent(
114
- mcp_client=mock_mcp_client,
115
- model_client=mock_model_client,
116
- validate_api_key=False, # Skip validation in tests
173
+ async def test_format_blocks_text_only(self, mock_anthropic: AsyncAnthropic) -> None:
174
+ """Test formatting text content blocks."""
175
+ agent = ClaudeAgent.create(
176
+ model_client=mock_anthropic,
177
+ validate_api_key=False,
117
178
  )
118
179
 
119
- # Test with text only
120
- text_blocks: list[types.ContentBlock] = [
121
- types.TextContent(type="text", text="Hello, Claude!")
180
+ blocks: list[types.ContentBlock] = [
181
+ types.TextContent(type="text", text="Hello, world!"),
182
+ types.TextContent(type="text", text="How are you?"),
122
183
  ]
123
- messages = await agent.format_blocks(text_blocks)
184
+
185
+ messages = await agent.format_blocks(blocks)
124
186
  assert len(messages) == 1
125
187
  assert messages[0]["role"] == "user"
126
188
  content = messages[0]["content"]
127
189
  assert isinstance(content, list)
128
- assert len(content) == 1
129
- assert content[0]["type"] == "text"
130
- assert content[0]["text"] == "Hello, Claude!"
190
+ assert len(content) == 2
191
+ assert content[0]["type"] == "text" # type: ignore[index]
192
+ assert content[0]["text"] == "Hello, world!" # type: ignore[index]
193
+
194
+ @pytest.mark.asyncio
195
+ async def test_format_blocks_with_image(self, mock_anthropic: AsyncAnthropic) -> None:
196
+ """Test formatting image content blocks."""
197
+ agent = ClaudeAgent.create(
198
+ model_client=mock_anthropic,
199
+ validate_api_key=False,
200
+ )
131
201
 
132
- # Test with screenshot
133
- image_blocks: list[types.ContentBlock] = [
134
- types.TextContent(type="text", text="Look at this"),
202
+ blocks: list[types.ContentBlock] = [
203
+ types.TextContent(type="text", text="Look at this:"),
135
204
  types.ImageContent(type="image", data="base64data", mimeType="image/png"),
136
205
  ]
137
- messages = await agent.format_blocks(image_blocks)
206
+
207
+ messages = await agent.format_blocks(blocks)
138
208
  assert len(messages) == 1
139
- assert messages[0]["role"] == "user"
140
209
  content = messages[0]["content"]
141
210
  assert isinstance(content, list)
142
211
  assert len(content) == 2
143
- # Content blocks are in order
144
- assert content[0]["type"] == "text"
145
- assert content[0]["text"] == "Look at this"
146
- assert content[1]["type"] == "image"
147
- assert content[1]["source"]["data"] == "base64data"
212
+ assert content[1]["type"] == "image" # type: ignore[index]
148
213
 
149
214
  @pytest.mark.asyncio
150
- async def test_format_tool_results_method(self, mock_mcp_client):
151
- """Test the agent's format_tool_results method."""
152
- mock_model_client = MagicMock()
153
- agent = ClaudeAgent(
154
- mcp_client=mock_mcp_client,
155
- model_client=mock_model_client,
156
- validate_api_key=False, # Skip validation in tests
215
+ async def test_format_tool_results_text(self, mock_anthropic: AsyncAnthropic) -> None:
216
+ """Test formatting tool results with text content."""
217
+ agent = ClaudeAgent.create(
218
+ model_client=mock_anthropic,
219
+ validate_api_key=False,
157
220
  )
158
221
 
159
- tool_calls = [
160
- MCPToolCall(name="test_tool", arguments={}, id="id1"),
161
- ]
162
-
222
+ tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
163
223
  tool_results = [
164
- MCPToolResult(content=[types.TextContent(type="text", text="Success")], isError=False),
224
+ MCPToolResult(
225
+ content=[types.TextContent(type="text", text="Tool output")],
226
+ isError=False,
227
+ )
165
228
  ]
166
229
 
167
230
  messages = await agent.format_tool_results(tool_calls, tool_results)
168
-
169
- # format_tool_results returns a single user message with tool result content
170
231
  assert len(messages) == 1
171
232
  assert messages[0]["role"] == "user"
172
- # The content is wrapped in a tool result block
173
- content = list(messages[0]["content"])
233
+ content = messages[0]["content"]
234
+ assert isinstance(content, list)
174
235
  assert len(content) == 1
175
- assert content[0]["type"] == "tool_result" # type: ignore
176
- assert content[0]["tool_use_id"] == "id1" # type: ignore
177
- # The actual content is nested inside
178
- inner_content = list(content[0]["content"]) # type: ignore
179
- assert inner_content[0]["type"] == "text" # type: ignore
180
- assert inner_content[0]["text"] == "Success" # type: ignore
236
+ assert content[0]["type"] == "tool_result" # type: ignore[index]
237
+ assert content[0]["tool_use_id"] == "call_123" # type: ignore[index]
238
+
239
+ @pytest.mark.asyncio
240
+ async def test_format_tool_results_with_error(self, mock_anthropic: AsyncAnthropic) -> None:
241
+ """Test formatting tool results with error."""
242
+ agent = ClaudeAgent.create(
243
+ model_client=mock_anthropic,
244
+ validate_api_key=False,
245
+ )
246
+
247
+ tool_calls = [MCPToolCall(id="call_123", name="test_tool", arguments={})]
248
+ tool_results = [
249
+ MCPToolResult(
250
+ content=[types.TextContent(type="text", text="Error message")],
251
+ isError=True,
252
+ )
253
+ ]
254
+
255
+ messages = await agent.format_tool_results(tool_calls, tool_results)
256
+ assert len(messages) == 1
257
+ content = messages[0]["content"]
258
+ # Error content should include "Error:" prefix
259
+ assert any("Error" in str(block) for block in content[0]["content"]) # type: ignore[index]
260
+
261
+ @pytest.mark.asyncio
262
+ async def test_get_system_messages(self, mock_anthropic: AsyncAnthropic) -> None:
263
+ """Test that system messages return empty (Claude uses system param)."""
264
+ agent = ClaudeAgent.create(
265
+ model_client=mock_anthropic,
266
+ system_prompt="You are a helpful assistant.",
267
+ validate_api_key=False,
268
+ )
269
+
270
+ messages = await agent.get_system_messages()
271
+ # Claude doesn't use system messages in the message list
272
+ assert messages == []
181
273
 
182
274
  @pytest.mark.asyncio
183
- async def test_get_response(self, mock_mcp_client, mock_anthropic):
184
- """Test getting model response from Claude API."""
185
- # Disable telemetry for this test to avoid backend configuration issues
275
+ async def test_get_response_with_thinking(self, mock_anthropic: AsyncAnthropic) -> None:
276
+ """Test getting model response with thinking content."""
186
277
  with patch("hud.settings.settings.telemetry_enabled", False):
187
- agent = ClaudeAgent(
188
- mcp_client=mock_mcp_client,
278
+ agent = ClaudeAgent.create(
189
279
  model_client=mock_anthropic,
190
- validate_api_key=False, # Skip validation in tests
280
+ validate_api_key=False,
191
281
  )
282
+ # Set up agent as initialized
283
+ agent.claude_tools = []
284
+ agent.tool_mapping = {}
285
+ agent.has_computer_tool = False
286
+ agent._initialized = True
192
287
 
193
- # Mock the API response
194
288
  mock_response = MagicMock()
195
289
 
196
- # Create text block
290
+ thinking_block = MagicMock()
291
+ thinking_block.type = "thinking"
292
+ thinking_block.thinking = "Let me analyze this problem..."
293
+
197
294
  text_block = MagicMock()
198
295
  text_block.type = "text"
199
- text_block.text = "Hello!"
296
+ text_block.text = "Here is the answer"
200
297
 
201
- # Create tool use block
202
- tool_block = MagicMock()
203
- tool_block.type = "tool_use"
204
- tool_block.id = "tool_123"
205
- tool_block.name = "test_tool"
206
- tool_block.input = {"param": "value"}
298
+ mock_response.content = [thinking_block, text_block]
299
+ mock_response.usage = MagicMock(input_tokens=10, output_tokens=30)
207
300
 
208
- mock_response.content = [text_block, tool_block]
209
- mock_response.usage = MagicMock(input_tokens=10, output_tokens=20)
210
- mock_anthropic.beta.messages.create = AsyncMock(return_value=mock_response)
301
+ mock_stream = MockStreamContextManager(mock_response)
302
+ mock_anthropic.beta.messages.stream = MagicMock(return_value=mock_stream)
211
303
 
212
304
  messages = [
213
305
  cast(
214
306
  "BetaMessageParam",
215
- {"role": "user", "content": [{"type": "text", "text": "Hi"}]},
307
+ {"role": "user", "content": [{"type": "text", "text": "Hard question"}]},
216
308
  )
217
309
  ]
218
310
  response = await agent.get_response(messages)
219
311
 
220
- assert response.content == "Hello!"
221
- assert len(response.tool_calls) == 1
222
- assert response.tool_calls[0].name == "test_tool"
223
- assert response.tool_calls[0].arguments == {"param": "value"}
224
- # The test was checking for Claude-specific attributes that aren't part of ModelResponse
225
- # These would need to be accessed from the original Claude response if needed
312
+ assert response.content == "Here is the answer"
313
+ assert response.reasoning == "Let me analyze this problem..."
314
+
315
+ @pytest.mark.asyncio
316
+ async def test_convert_tools_for_claude(self, mock_anthropic: AsyncAnthropic) -> None:
317
+ """Test converting MCP tools to Claude format."""
318
+ tools = [
319
+ types.Tool(
320
+ name="my_tool",
321
+ description="A test tool",
322
+ inputSchema={"type": "object", "properties": {"x": {"type": "string"}}},
323
+ )
324
+ ]
325
+ ctx = MockEvalContext(tools=tools)
326
+ agent = ClaudeAgent.create(
327
+ model_client=mock_anthropic,
328
+ validate_api_key=False,
329
+ )
330
+
331
+ agent.ctx = ctx
332
+ await agent._initialize_from_ctx(ctx)
333
+
334
+ # Check that tools were converted
335
+ assert len(agent.claude_tools) == 1
336
+ assert agent.claude_tools[0]["name"] == "my_tool" # type: ignore[typeddict-item]
337
+
338
+ @pytest.mark.asyncio
339
+ async def test_computer_tool_detection(self, mock_anthropic: AsyncAnthropic) -> None:
340
+ """Test that computer tools are detected for beta API."""
341
+ tools = [
342
+ types.Tool(
343
+ name="computer",
344
+ description="Control computer",
345
+ inputSchema={"type": "object"},
346
+ )
347
+ ]
348
+ ctx = MockEvalContext(tools=tools)
349
+ agent = ClaudeAgent.create(
350
+ model_client=mock_anthropic,
351
+ validate_api_key=False,
352
+ )
353
+
354
+ agent.ctx = ctx
355
+ await agent._initialize_from_ctx(ctx)
356
+
357
+ assert agent.has_computer_tool is True
358
+
359
+ @pytest.mark.asyncio
360
+ async def test_get_response_with_text(self, mock_anthropic: AsyncAnthropic) -> None:
361
+ """Test getting response with text output."""
362
+ # Create mock response
363
+ mock_response = MagicMock()
364
+ mock_response.content = [MagicMock(type="text", text="Hello!")]
365
+
366
+ mock_stream = MockStreamContextManager(mock_response)
367
+ mock_anthropic.beta.messages.stream = MagicMock(return_value=mock_stream)
368
+
369
+ agent = ClaudeAgent.create(
370
+ model_client=mock_anthropic,
371
+ validate_api_key=False,
372
+ )
373
+ agent.claude_tools = []
374
+ agent.tool_mapping = {}
375
+ agent.has_computer_tool = False
376
+ agent._initialized = True
377
+
378
+ response = await agent.get_response([])
379
+ assert response.content == "Hello!"
380
+ assert response.done is True
381
+ assert len(response.tool_calls) == 0
382
+
383
+ @pytest.mark.asyncio
384
+ async def test_get_response_with_tool_call(self, mock_anthropic: AsyncAnthropic) -> None:
385
+ """Test getting response with tool call."""
386
+ mock_tool_use = MagicMock()
387
+ mock_tool_use.type = "tool_use"
388
+ mock_tool_use.id = "call_123"
389
+ mock_tool_use.name = "my_tool"
390
+ mock_tool_use.input = {"x": "value"}
391
+
392
+ mock_response = MagicMock()
393
+ mock_response.content = [mock_tool_use]
394
+
395
+ mock_stream = MockStreamContextManager(mock_response)
396
+ mock_anthropic.beta.messages.stream = MagicMock(return_value=mock_stream)
397
+
398
+ agent = ClaudeAgent.create(
399
+ model_client=mock_anthropic,
400
+ validate_api_key=False,
401
+ )
402
+ agent.claude_tools = []
403
+ agent.tool_mapping = {"my_tool": "my_tool"}
404
+ agent.has_computer_tool = False
405
+ agent._initialized = True
406
+
407
+ response = await agent.get_response([])
408
+ assert response.done is False
409
+ assert len(response.tool_calls) == 1
410
+ assert response.tool_calls[0].name == "my_tool"
411
+ assert response.tool_calls[0].arguments == {"x": "value"}
412
+
413
+
414
+ class TestClaudeAgentBedrock:
415
+ """Test ClaudeAgent class with Bedrock."""
416
+
417
+ @pytest.fixture
418
+ def bedrock_client(self) -> AsyncAnthropicBedrock:
419
+ """Create a real AsyncAnthropicBedrock client and stub networked methods."""
420
+ client = AsyncAnthropicBedrock(
421
+ aws_access_key="AKIATEST",
422
+ aws_secret_key="secret",
423
+ aws_region="us-east-1",
424
+ )
425
+ # Stub the actual Bedrock call so tests are hermetic.
426
+ client.beta.messages.create = AsyncMock()
427
+ return client
428
+
429
+ @pytest.mark.asyncio
430
+ async def test_init(self, bedrock_client: AsyncAnthropicBedrock) -> None:
431
+ """Test agent initialization."""
432
+ agent = ClaudeAgent.create(
433
+ model_client=bedrock_client,
434
+ model="test-model-arn",
435
+ validate_api_key=False,
436
+ )
226
437
 
227
- # Verify API was called correctly
228
- mock_anthropic.beta.messages.create.assert_called_once()
438
+ assert agent.model_name == "Claude"
439
+ assert agent.config.model == "test-model-arn"
440
+ assert agent.anthropic_client == bedrock_client
229
441
 
230
442
  @pytest.mark.asyncio
231
- async def test_get_model_response_text_only(self, mock_mcp_client, mock_anthropic):
232
- """Test getting text-only response."""
233
- # Disable telemetry for this test to avoid backend configuration issues
443
+ async def test_get_response_bedrock_uses_create_not_stream(
444
+ self, bedrock_client: AsyncAnthropicBedrock
445
+ ) -> None:
446
+ """Bedrock path must call messages.create() (Bedrock doesn't support stream())."""
234
447
  with patch("hud.settings.settings.telemetry_enabled", False):
235
- agent = ClaudeAgent(
236
- mcp_client=mock_mcp_client,
237
- model_client=mock_anthropic,
238
- validate_api_key=False, # Skip validation in tests
448
+ agent = ClaudeAgent.create(
449
+ model_client=bedrock_client,
450
+ model="test-model-arn",
451
+ validate_api_key=False,
239
452
  )
240
453
 
454
+ # Enable computer tool to verify betas list includes computer-use in Bedrock mode.
455
+ agent.has_computer_tool = True
456
+
241
457
  mock_response = MagicMock()
242
- # Create text block
243
458
  text_block = MagicMock()
244
459
  text_block.type = "text"
245
- text_block.text = "Just text"
460
+ text_block.text = "Hello from Bedrock"
246
461
  mock_response.content = [text_block]
247
- mock_response.usage = MagicMock(input_tokens=5, output_tokens=10)
248
- mock_anthropic.beta.messages.create = AsyncMock(return_value=mock_response)
462
+
463
+ bedrock_client.beta.messages.create.return_value = mock_response # type: ignore[union-attr]
249
464
 
250
465
  messages = [
251
466
  cast(
@@ -255,95 +470,47 @@ class TestClaudeAgent:
255
470
  ]
256
471
  response = await agent.get_response(messages)
257
472
 
258
- assert response.content == "Just text"
473
+ assert response.content == "Hello from Bedrock"
259
474
  assert response.tool_calls == []
260
475
 
476
+ # Bedrock-specific behavior: uses create() and appends assistant message directly.
477
+ assert not hasattr(bedrock_client.beta.messages, "stream")
478
+ bedrock_client.beta.messages.create.assert_awaited_once() # type: ignore[union-attr]
479
+ assert len(messages) == 2
480
+ assert messages[-1]["role"] == "assistant"
481
+
482
+ # Ensure the Bedrock call shape is stable.
483
+ _, kwargs = bedrock_client.beta.messages.create.call_args # type: ignore[union-attr]
484
+ assert kwargs["model"] == "test-model-arn"
485
+ assert kwargs["tool_choice"] == {"type": "auto", "disable_parallel_tool_use": True}
486
+ assert "fine-grained-tool-streaming-2025-05-14" in kwargs["betas"]
487
+ assert "computer-use-2025-01-24" in kwargs["betas"]
488
+
261
489
  @pytest.mark.asyncio
262
- async def test_get_model_response_error(self, mock_mcp_client, mock_anthropic):
263
- """Test handling API errors."""
264
- # Disable telemetry for this test to avoid backend configuration issues
490
+ async def test_get_response_bedrock_missing_boto3_raises_value_error(
491
+ self, bedrock_client: AsyncAnthropicBedrock
492
+ ) -> None:
493
+ """If boto3 isn't installed, Bedrock client import path should raise a clear ValueError."""
265
494
  with patch("hud.settings.settings.telemetry_enabled", False):
266
- agent = ClaudeAgent(
267
- mcp_client=mock_mcp_client,
268
- model_client=mock_anthropic,
269
- validate_api_key=False, # Skip validation in tests
270
- )
271
-
272
- # Mock API error
273
- mock_anthropic.beta.messages.create = AsyncMock(
274
- side_effect=BadRequestError(
275
- message="Invalid request",
276
- response=MagicMock(status_code=400),
277
- body={"error": {"message": "Invalid request"}},
278
- )
495
+ agent = ClaudeAgent.create(
496
+ model_client=bedrock_client,
497
+ model="test-model-arn",
498
+ validate_api_key=False,
279
499
  )
280
500
 
501
+ bedrock_client.beta.messages.create.side_effect = ModuleNotFoundError("boto3") # type: ignore[union-attr]
281
502
  messages = [{"role": "user", "content": [{"type": "text", "text": "Hi"}]}]
282
503
 
283
- with pytest.raises(BadRequestError):
504
+ with pytest.raises(ValueError, match=r"boto3 is required for AWS Bedrock"):
284
505
  await agent.get_response(messages) # type: ignore
285
506
 
286
- # This test is commented out as it's testing complex integration scenarios
287
- # that may have changed in the implementation
288
- # @pytest.mark.asyncio
289
- # async def test_run_with_tools(self, mock_mcp_client, mock_anthropic):
290
- # """Test running agent with tool usage."""
291
- # # Disable telemetry for this test to avoid backend configuration issues
292
- # with patch("hud.settings.settings.telemetry_enabled", False):
293
- # agent = ClaudeAgent(mcp_client=mock_mcp_client, model_client=mock_anthropic)
294
-
295
- # # Mock tool availability
296
- # agent._available_tools = [
297
- # types.Tool(
298
- # name="calculator", description="Calculator", inputSchema={"type": "object"}
299
- # )
300
- # ]
301
- # agent._tool_map = {
302
- # "calculator": types.Tool(
303
- # name="calculator", description="Calculator", inputSchema={"type": "object"}
304
- # )
305
- # }
306
-
307
- # # Mock initial response with tool use
308
- # initial_response = MagicMock()
309
- # # Create tool use block
310
- # tool_block = MagicMock()
311
- # tool_block.type = "tool_use"
312
- # tool_block.id = "calc_123"
313
- # tool_block.name = "calculator"
314
- # tool_block.input = {"operation": "add", "a": 2, "b": 3}
315
- # initial_response.content = [tool_block]
316
- # initial_response.usage = MagicMock(input_tokens=10, output_tokens=15)
317
-
318
- # # Mock follow-up response
319
- # final_response = MagicMock()
320
- # text_block = MagicMock()
321
- # text_block.type = "text"
322
- # text_block.text = "2 + 3 = 5"
323
- # final_response.content = [text_block]
324
- # final_response.usage = MagicMock(input_tokens=20, output_tokens=10)
325
-
326
- # mock_anthropic.beta.messages.create = AsyncMock(
327
- # side_effect=[initial_response, final_response]
328
- # )
329
-
330
- # # Mock tool execution
331
- # mock_mcp_client.call_tool = AsyncMock(
332
- # return_value=MCPToolResult(
333
- # content=[types.TextContent(type="text", text="5")], isError=False
334
- # )
335
- # )
336
-
337
- # # Mock the mcp_client properties
338
- # mock_mcp_client.mcp_config = {"test_server": {"url": "http://localhost"}}
339
- # mock_mcp_client.list_tools = AsyncMock(return_value=agent._available_tools)
340
- # mock_mcp_client.initialize = AsyncMock()
341
-
342
- # # Initialize the agent
343
- # await agent.initialize()
344
-
345
- # # Use a string prompt instead of a task
346
- # result = await agent.run("What is 2 + 3?")
347
-
348
- # assert result.content == "2 + 3 = 5"
349
- # assert result.done is True
507
+ def test_init_with_bedrock_client_does_not_require_anthropic_api_key(
508
+ self, bedrock_client: AsyncAnthropicBedrock
509
+ ) -> None:
510
+ """Providing model_client should bypass ANTHROPIC_API_KEY validation."""
511
+ with patch("hud.settings.settings.anthropic_api_key", None):
512
+ agent = ClaudeAgent.create(
513
+ model_client=bedrock_client,
514
+ validate_api_key=False,
515
+ )
516
+ assert agent.anthropic_client == bedrock_client