hud-python 0.3.5__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/__init__.py +22 -89
- hud/agents/__init__.py +17 -0
- hud/agents/art.py +101 -0
- hud/agents/base.py +599 -0
- hud/{mcp → agents}/claude.py +373 -321
- hud/{mcp → agents}/langchain.py +250 -250
- hud/agents/misc/__init__.py +7 -0
- hud/{agent → agents}/misc/response_agent.py +80 -80
- hud/{mcp → agents}/openai.py +352 -334
- hud/agents/openai_chat_generic.py +154 -0
- hud/{mcp → agents}/tests/__init__.py +1 -1
- hud/agents/tests/test_base.py +742 -0
- hud/agents/tests/test_claude.py +324 -0
- hud/{mcp → agents}/tests/test_client.py +363 -324
- hud/{mcp → agents}/tests/test_openai.py +237 -238
- hud/cli/__init__.py +617 -0
- hud/cli/__main__.py +8 -0
- hud/cli/analyze.py +371 -0
- hud/cli/analyze_metadata.py +230 -0
- hud/cli/build.py +427 -0
- hud/cli/clone.py +185 -0
- hud/cli/cursor.py +92 -0
- hud/cli/debug.py +392 -0
- hud/cli/docker_utils.py +83 -0
- hud/cli/init.py +281 -0
- hud/cli/interactive.py +353 -0
- hud/cli/mcp_server.py +756 -0
- hud/cli/pull.py +336 -0
- hud/cli/push.py +379 -0
- hud/cli/remote_runner.py +311 -0
- hud/cli/runner.py +160 -0
- hud/cli/tests/__init__.py +3 -0
- hud/cli/tests/test_analyze.py +284 -0
- hud/cli/tests/test_cli_init.py +265 -0
- hud/cli/tests/test_cli_main.py +27 -0
- hud/cli/tests/test_clone.py +142 -0
- hud/cli/tests/test_cursor.py +253 -0
- hud/cli/tests/test_debug.py +453 -0
- hud/cli/tests/test_mcp_server.py +139 -0
- hud/cli/tests/test_utils.py +388 -0
- hud/cli/utils.py +263 -0
- hud/clients/README.md +143 -0
- hud/clients/__init__.py +16 -0
- hud/clients/base.py +354 -0
- hud/clients/fastmcp.py +202 -0
- hud/clients/mcp_use.py +278 -0
- hud/clients/tests/__init__.py +1 -0
- hud/clients/tests/test_client_integration.py +111 -0
- hud/clients/tests/test_fastmcp.py +342 -0
- hud/clients/tests/test_protocol.py +188 -0
- hud/clients/utils/__init__.py +1 -0
- hud/clients/utils/retry_transport.py +160 -0
- hud/datasets.py +322 -192
- hud/misc/__init__.py +1 -0
- hud/{agent → misc}/claude_plays_pokemon.py +292 -283
- hud/otel/__init__.py +35 -0
- hud/otel/collector.py +142 -0
- hud/otel/config.py +164 -0
- hud/otel/context.py +536 -0
- hud/otel/exporters.py +366 -0
- hud/otel/instrumentation.py +97 -0
- hud/otel/processors.py +118 -0
- hud/otel/tests/__init__.py +1 -0
- hud/otel/tests/test_processors.py +197 -0
- hud/server/__init__.py +5 -5
- hud/server/context.py +114 -0
- hud/server/helper/__init__.py +5 -0
- hud/server/low_level.py +132 -0
- hud/server/server.py +166 -0
- hud/server/tests/__init__.py +3 -0
- hud/settings.py +73 -79
- hud/shared/__init__.py +5 -0
- hud/{exceptions.py → shared/exceptions.py} +180 -180
- hud/{server → shared}/requests.py +264 -264
- hud/shared/tests/test_exceptions.py +157 -0
- hud/{server → shared}/tests/test_requests.py +275 -275
- hud/telemetry/__init__.py +25 -30
- hud/telemetry/instrument.py +379 -0
- hud/telemetry/job.py +309 -141
- hud/telemetry/replay.py +74 -0
- hud/telemetry/trace.py +83 -0
- hud/tools/__init__.py +33 -34
- hud/tools/base.py +365 -65
- hud/tools/bash.py +161 -137
- hud/tools/computer/__init__.py +15 -13
- hud/tools/computer/anthropic.py +437 -420
- hud/tools/computer/hud.py +376 -334
- hud/tools/computer/openai.py +295 -292
- hud/tools/computer/settings.py +82 -0
- hud/tools/edit.py +314 -290
- hud/tools/executors/__init__.py +30 -30
- hud/tools/executors/base.py +539 -532
- hud/tools/executors/pyautogui.py +621 -619
- hud/tools/executors/tests/__init__.py +1 -1
- hud/tools/executors/tests/test_base_executor.py +338 -338
- hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
- hud/tools/executors/xdo.py +511 -503
- hud/tools/{playwright_tool.py → playwright.py} +412 -379
- hud/tools/tests/__init__.py +3 -3
- hud/tools/tests/test_base.py +282 -0
- hud/tools/tests/test_bash.py +158 -152
- hud/tools/tests/test_bash_extended.py +197 -0
- hud/tools/tests/test_computer.py +425 -52
- hud/tools/tests/test_computer_actions.py +34 -34
- hud/tools/tests/test_edit.py +259 -240
- hud/tools/tests/test_init.py +27 -27
- hud/tools/tests/test_playwright_tool.py +183 -183
- hud/tools/tests/test_tools.py +145 -157
- hud/tools/tests/test_utils.py +156 -156
- hud/tools/types.py +72 -0
- hud/tools/utils.py +50 -50
- hud/types.py +136 -89
- hud/utils/__init__.py +10 -16
- hud/utils/async_utils.py +65 -0
- hud/utils/design.py +168 -0
- hud/utils/mcp.py +55 -0
- hud/utils/progress.py +149 -149
- hud/utils/telemetry.py +66 -66
- hud/utils/tests/test_async_utils.py +173 -0
- hud/utils/tests/test_init.py +17 -21
- hud/utils/tests/test_progress.py +261 -225
- hud/utils/tests/test_telemetry.py +82 -37
- hud/utils/tests/test_version.py +8 -8
- hud/version.py +7 -7
- hud_python-0.4.0.dist-info/METADATA +474 -0
- hud_python-0.4.0.dist-info/RECORD +132 -0
- hud_python-0.4.0.dist-info/entry_points.txt +3 -0
- {hud_python-0.3.5.dist-info → hud_python-0.4.0.dist-info}/licenses/LICENSE +21 -21
- hud/adapters/__init__.py +0 -8
- hud/adapters/claude/__init__.py +0 -5
- hud/adapters/claude/adapter.py +0 -180
- hud/adapters/claude/tests/__init__.py +0 -1
- hud/adapters/claude/tests/test_adapter.py +0 -519
- hud/adapters/common/__init__.py +0 -6
- hud/adapters/common/adapter.py +0 -178
- hud/adapters/common/tests/test_adapter.py +0 -289
- hud/adapters/common/types.py +0 -446
- hud/adapters/operator/__init__.py +0 -5
- hud/adapters/operator/adapter.py +0 -108
- hud/adapters/operator/tests/__init__.py +0 -1
- hud/adapters/operator/tests/test_adapter.py +0 -370
- hud/agent/__init__.py +0 -19
- hud/agent/base.py +0 -126
- hud/agent/claude.py +0 -271
- hud/agent/langchain.py +0 -215
- hud/agent/misc/__init__.py +0 -3
- hud/agent/operator.py +0 -268
- hud/agent/tests/__init__.py +0 -1
- hud/agent/tests/test_base.py +0 -202
- hud/env/__init__.py +0 -11
- hud/env/client.py +0 -35
- hud/env/docker_client.py +0 -349
- hud/env/environment.py +0 -446
- hud/env/local_docker_client.py +0 -358
- hud/env/remote_client.py +0 -212
- hud/env/remote_docker_client.py +0 -292
- hud/gym.py +0 -130
- hud/job.py +0 -773
- hud/mcp/__init__.py +0 -17
- hud/mcp/base.py +0 -631
- hud/mcp/client.py +0 -312
- hud/mcp/tests/test_base.py +0 -512
- hud/mcp/tests/test_claude.py +0 -294
- hud/task.py +0 -149
- hud/taskset.py +0 -237
- hud/telemetry/_trace.py +0 -347
- hud/telemetry/context.py +0 -230
- hud/telemetry/exporter.py +0 -575
- hud/telemetry/instrumentation/__init__.py +0 -3
- hud/telemetry/instrumentation/mcp.py +0 -259
- hud/telemetry/instrumentation/registry.py +0 -59
- hud/telemetry/mcp_models.py +0 -270
- hud/telemetry/tests/__init__.py +0 -1
- hud/telemetry/tests/test_context.py +0 -210
- hud/telemetry/tests/test_trace.py +0 -312
- hud/tools/helper/README.md +0 -56
- hud/tools/helper/__init__.py +0 -9
- hud/tools/helper/mcp_server.py +0 -78
- hud/tools/helper/server_initialization.py +0 -115
- hud/tools/helper/utils.py +0 -58
- hud/trajectory.py +0 -94
- hud/utils/agent.py +0 -37
- hud/utils/common.py +0 -256
- hud/utils/config.py +0 -120
- hud/utils/deprecation.py +0 -115
- hud/utils/misc.py +0 -53
- hud/utils/tests/test_common.py +0 -277
- hud/utils/tests/test_config.py +0 -129
- hud_python-0.3.5.dist-info/METADATA +0 -284
- hud_python-0.3.5.dist-info/RECORD +0 -120
- /hud/{adapters/common → shared}/tests/__init__.py +0 -0
- {hud_python-0.3.5.dist-info → hud_python-0.4.0.dist-info}/WHEEL +0 -0
hud/mcp/tests/test_claude.py
DELETED
|
@@ -1,294 +0,0 @@
|
|
|
1
|
-
"""Tests for Claude MCP Agent implementation."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
from typing import TYPE_CHECKING, cast
|
|
6
|
-
from unittest.mock import AsyncMock, MagicMock, patch
|
|
7
|
-
|
|
8
|
-
import pytest
|
|
9
|
-
from anthropic import BadRequestError
|
|
10
|
-
from mcp import types
|
|
11
|
-
from mcp.types import CallToolRequestParams as MCPToolCall
|
|
12
|
-
|
|
13
|
-
from hud.mcp.claude import (
|
|
14
|
-
ClaudeMCPAgent,
|
|
15
|
-
base64_to_content_block,
|
|
16
|
-
text_to_content_block,
|
|
17
|
-
tool_use_content_block,
|
|
18
|
-
)
|
|
19
|
-
|
|
20
|
-
if TYPE_CHECKING:
|
|
21
|
-
from anthropic.types.beta import BetaImageBlockParam, BetaMessageParam, BetaTextBlockParam
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
class TestClaudeHelperFunctions:
|
|
25
|
-
"""Test helper functions for Claude message formatting."""
|
|
26
|
-
|
|
27
|
-
def test_base64_to_content_block(self):
|
|
28
|
-
"""Test base64 image conversion."""
|
|
29
|
-
base64_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==" # noqa: E501
|
|
30
|
-
result = base64_to_content_block(base64_data)
|
|
31
|
-
|
|
32
|
-
assert result["type"] == "image"
|
|
33
|
-
assert result["source"]["type"] == "base64"
|
|
34
|
-
assert result["source"]["media_type"] == "image/png"
|
|
35
|
-
assert result["source"]["data"] == base64_data
|
|
36
|
-
|
|
37
|
-
def test_text_to_content_block(self):
|
|
38
|
-
"""Test text conversion."""
|
|
39
|
-
text = "Hello, world!"
|
|
40
|
-
result = text_to_content_block(text)
|
|
41
|
-
|
|
42
|
-
assert result["type"] == "text"
|
|
43
|
-
assert result["text"] == text
|
|
44
|
-
|
|
45
|
-
def test_tool_use_content_block(self):
|
|
46
|
-
"""Test tool result content block creation."""
|
|
47
|
-
tool_use_id = "tool_123"
|
|
48
|
-
content: list[BetaTextBlockParam | BetaImageBlockParam] = [
|
|
49
|
-
text_to_content_block("Result text")
|
|
50
|
-
]
|
|
51
|
-
|
|
52
|
-
result = tool_use_content_block(tool_use_id, content)
|
|
53
|
-
|
|
54
|
-
assert result["type"] == "tool_result"
|
|
55
|
-
assert result["tool_use_id"] == tool_use_id
|
|
56
|
-
assert result["content"] == content # type: ignore
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
class TestClaudeMCPAgent:
|
|
60
|
-
"""Test ClaudeMCPAgent class."""
|
|
61
|
-
|
|
62
|
-
@pytest.fixture
|
|
63
|
-
def mock_mcp_client(self):
|
|
64
|
-
"""Create a mock MCP client."""
|
|
65
|
-
mcp_client = MagicMock()
|
|
66
|
-
mcp_client.get_all_active_sessions = MagicMock(return_value={})
|
|
67
|
-
mcp_client.get_tool_map = MagicMock(return_value={})
|
|
68
|
-
return mcp_client
|
|
69
|
-
|
|
70
|
-
@pytest.fixture
|
|
71
|
-
def mock_anthropic(self):
|
|
72
|
-
"""Create a mock Anthropic client."""
|
|
73
|
-
with patch("hud.mcp.claude.AsyncAnthropic") as mock:
|
|
74
|
-
client = AsyncMock()
|
|
75
|
-
# Add beta attribute with messages
|
|
76
|
-
client.beta = AsyncMock()
|
|
77
|
-
client.beta.messages = AsyncMock()
|
|
78
|
-
mock.return_value = client
|
|
79
|
-
yield client
|
|
80
|
-
|
|
81
|
-
@pytest.mark.asyncio
|
|
82
|
-
async def test_init(self, mock_mcp_client, mock_anthropic):
|
|
83
|
-
"""Test agent initialization."""
|
|
84
|
-
# Test with provided model_client
|
|
85
|
-
mock_model_client = MagicMock()
|
|
86
|
-
agent = ClaudeMCPAgent(
|
|
87
|
-
mcp_client=mock_mcp_client,
|
|
88
|
-
model_client=mock_model_client,
|
|
89
|
-
model="claude-3-opus-20240229",
|
|
90
|
-
max_tokens=1000,
|
|
91
|
-
)
|
|
92
|
-
|
|
93
|
-
assert agent.model_name == "claude-3-opus-20240229"
|
|
94
|
-
assert agent.max_tokens == 1000
|
|
95
|
-
assert agent.anthropic_client == mock_model_client
|
|
96
|
-
|
|
97
|
-
@pytest.mark.asyncio
|
|
98
|
-
async def test_init_without_model_client(self, mock_mcp_client):
|
|
99
|
-
"""Test agent initialization without model client."""
|
|
100
|
-
with patch("hud.mcp.claude.settings.anthropic_api_key", "test_key"):
|
|
101
|
-
agent = ClaudeMCPAgent(mcp_client=mock_mcp_client, model="claude-3-opus-20240229")
|
|
102
|
-
|
|
103
|
-
assert agent.model_name == "claude-3-opus-20240229"
|
|
104
|
-
assert agent.anthropic_client is not None
|
|
105
|
-
|
|
106
|
-
@pytest.mark.asyncio
|
|
107
|
-
async def test_create_initial_messages(self, mock_mcp_client):
|
|
108
|
-
"""Test creating initial messages."""
|
|
109
|
-
mock_model_client = MagicMock()
|
|
110
|
-
agent = ClaudeMCPAgent(mcp_client=mock_mcp_client, model_client=mock_model_client)
|
|
111
|
-
|
|
112
|
-
# Test with text only
|
|
113
|
-
messages = await agent.create_initial_messages("Hello, Claude!")
|
|
114
|
-
assert len(messages) == 1
|
|
115
|
-
assert messages[0]["role"] == "user"
|
|
116
|
-
content = list(messages[0]["content"])
|
|
117
|
-
assert content[0]["type"] == "text" # type: ignore
|
|
118
|
-
assert content[0]["text"] == "Hello, Claude!" # type: ignore
|
|
119
|
-
|
|
120
|
-
# Test with screenshot
|
|
121
|
-
messages = await agent.create_initial_messages("Look at this", screenshot="base64data")
|
|
122
|
-
assert len(messages) == 1
|
|
123
|
-
assert messages[0]["role"] == "user"
|
|
124
|
-
content = list(messages[0]["content"])
|
|
125
|
-
assert len(content) == 2
|
|
126
|
-
# Claude puts text first, then image
|
|
127
|
-
assert content[0]["type"] == "text" # type: ignore
|
|
128
|
-
assert content[0]["text"] == "Look at this" # type: ignore
|
|
129
|
-
assert content[1]["type"] == "image" # type: ignore
|
|
130
|
-
|
|
131
|
-
@pytest.mark.asyncio
|
|
132
|
-
async def test_format_tool_results_method(self, mock_mcp_client):
|
|
133
|
-
"""Test the agent's format_tool_results method."""
|
|
134
|
-
mock_model_client = MagicMock()
|
|
135
|
-
agent = ClaudeMCPAgent(mcp_client=mock_mcp_client, model_client=mock_model_client)
|
|
136
|
-
|
|
137
|
-
tool_calls = [
|
|
138
|
-
MCPToolCall(name="test_tool", arguments={}, tool_use_id="id1"), # type: ignore
|
|
139
|
-
]
|
|
140
|
-
|
|
141
|
-
tool_results = [
|
|
142
|
-
types.CallToolResult(
|
|
143
|
-
content=[types.TextContent(type="text", text="Success")], isError=False
|
|
144
|
-
),
|
|
145
|
-
]
|
|
146
|
-
|
|
147
|
-
messages = await agent.format_tool_results(tool_calls, tool_results)
|
|
148
|
-
|
|
149
|
-
# format_tool_results returns a single user message with tool result content
|
|
150
|
-
assert len(messages) == 1
|
|
151
|
-
assert messages[0]["role"] == "user"
|
|
152
|
-
# The content is wrapped in a tool result block
|
|
153
|
-
content = list(messages[0]["content"])
|
|
154
|
-
assert len(content) == 1
|
|
155
|
-
assert content[0]["type"] == "tool_result" # type: ignore
|
|
156
|
-
assert content[0]["tool_use_id"] == "id1" # type: ignore
|
|
157
|
-
# The actual content is nested inside
|
|
158
|
-
inner_content = list(content[0]["content"]) # type: ignore
|
|
159
|
-
assert inner_content[0]["type"] == "text" # type: ignore
|
|
160
|
-
assert inner_content[0]["text"] == "Success" # type: ignore
|
|
161
|
-
|
|
162
|
-
@pytest.mark.asyncio
|
|
163
|
-
async def test_get_model_response(self, mock_mcp_client, mock_anthropic):
|
|
164
|
-
"""Test getting model response from Claude API."""
|
|
165
|
-
agent = ClaudeMCPAgent(mcp_client=mock_mcp_client, model_client=mock_anthropic)
|
|
166
|
-
|
|
167
|
-
# Mock the API response
|
|
168
|
-
mock_response = MagicMock()
|
|
169
|
-
|
|
170
|
-
# Create text block
|
|
171
|
-
text_block = MagicMock()
|
|
172
|
-
text_block.type = "text"
|
|
173
|
-
text_block.text = "Hello!"
|
|
174
|
-
|
|
175
|
-
# Create tool use block
|
|
176
|
-
tool_block = MagicMock()
|
|
177
|
-
tool_block.type = "tool_use"
|
|
178
|
-
tool_block.id = "tool_123"
|
|
179
|
-
tool_block.name = "test_tool"
|
|
180
|
-
tool_block.input = {"param": "value"}
|
|
181
|
-
|
|
182
|
-
mock_response.content = [text_block, tool_block]
|
|
183
|
-
mock_response.usage = MagicMock(input_tokens=10, output_tokens=20)
|
|
184
|
-
mock_anthropic.beta.messages.create = AsyncMock(return_value=mock_response)
|
|
185
|
-
|
|
186
|
-
messages = [
|
|
187
|
-
cast("BetaMessageParam", {"role": "user", "content": [{"type": "text", "text": "Hi"}]})
|
|
188
|
-
]
|
|
189
|
-
response = await agent.get_model_response(messages)
|
|
190
|
-
|
|
191
|
-
assert response.content == "Hello!"
|
|
192
|
-
assert len(response.tool_calls) == 1
|
|
193
|
-
assert response.tool_calls[0].name == "test_tool"
|
|
194
|
-
assert response.tool_calls[0].arguments == {"param": "value"}
|
|
195
|
-
# The test was checking for Claude-specific attributes that aren't part of ModelResponse
|
|
196
|
-
# These would need to be accessed from the original Claude response if needed
|
|
197
|
-
|
|
198
|
-
# Verify API was called correctly
|
|
199
|
-
mock_anthropic.beta.messages.create.assert_called_once()
|
|
200
|
-
|
|
201
|
-
@pytest.mark.asyncio
|
|
202
|
-
async def test_get_model_response_text_only(self, mock_mcp_client, mock_anthropic):
|
|
203
|
-
"""Test getting text-only response."""
|
|
204
|
-
agent = ClaudeMCPAgent(mcp_client=mock_mcp_client, model_client=mock_anthropic)
|
|
205
|
-
|
|
206
|
-
mock_response = MagicMock()
|
|
207
|
-
# Create text block
|
|
208
|
-
text_block = MagicMock()
|
|
209
|
-
text_block.type = "text"
|
|
210
|
-
text_block.text = "Just text"
|
|
211
|
-
mock_response.content = [text_block]
|
|
212
|
-
mock_response.usage = MagicMock(input_tokens=5, output_tokens=10)
|
|
213
|
-
mock_anthropic.beta.messages.create = AsyncMock(return_value=mock_response)
|
|
214
|
-
|
|
215
|
-
messages = [
|
|
216
|
-
cast("BetaMessageParam", {"role": "user", "content": [{"type": "text", "text": "Hi"}]})
|
|
217
|
-
]
|
|
218
|
-
response = await agent.get_model_response(messages)
|
|
219
|
-
|
|
220
|
-
assert response.content == "Just text"
|
|
221
|
-
assert response.tool_calls == []
|
|
222
|
-
|
|
223
|
-
@pytest.mark.asyncio
|
|
224
|
-
async def test_get_model_response_error(self, mock_mcp_client, mock_anthropic):
|
|
225
|
-
"""Test handling API errors."""
|
|
226
|
-
agent = ClaudeMCPAgent(mcp_client=mock_mcp_client, model_client=mock_anthropic)
|
|
227
|
-
|
|
228
|
-
# Mock API error
|
|
229
|
-
mock_anthropic.beta.messages.create = AsyncMock(
|
|
230
|
-
side_effect=BadRequestError(
|
|
231
|
-
message="Invalid request",
|
|
232
|
-
response=MagicMock(status_code=400),
|
|
233
|
-
body={"error": {"message": "Invalid request"}},
|
|
234
|
-
)
|
|
235
|
-
)
|
|
236
|
-
|
|
237
|
-
messages = [{"role": "user", "content": [{"type": "text", "text": "Hi"}]}]
|
|
238
|
-
|
|
239
|
-
with pytest.raises(BadRequestError):
|
|
240
|
-
await agent.get_model_response(messages) # type: ignore
|
|
241
|
-
|
|
242
|
-
@pytest.mark.asyncio
|
|
243
|
-
async def test_run_with_tools(self, mock_mcp_client, mock_anthropic):
|
|
244
|
-
"""Test running agent with tool usage."""
|
|
245
|
-
agent = ClaudeMCPAgent(mcp_client=mock_mcp_client, model_client=mock_anthropic)
|
|
246
|
-
|
|
247
|
-
# Mock tool availability
|
|
248
|
-
agent._available_tools = [
|
|
249
|
-
types.Tool(name="calculator", description="Calculator", inputSchema={"type": "object"})
|
|
250
|
-
]
|
|
251
|
-
agent._tool_map = {
|
|
252
|
-
"calculator": (
|
|
253
|
-
"server1",
|
|
254
|
-
types.Tool(
|
|
255
|
-
name="calculator", description="Calculator", inputSchema={"type": "object"}
|
|
256
|
-
),
|
|
257
|
-
)
|
|
258
|
-
}
|
|
259
|
-
|
|
260
|
-
# Mock initial response with tool use
|
|
261
|
-
initial_response = MagicMock()
|
|
262
|
-
# Create tool use block
|
|
263
|
-
tool_block = MagicMock()
|
|
264
|
-
tool_block.type = "tool_use"
|
|
265
|
-
tool_block.id = "calc_123"
|
|
266
|
-
tool_block.name = "calculator"
|
|
267
|
-
tool_block.input = {"operation": "add", "a": 2, "b": 3}
|
|
268
|
-
initial_response.content = [tool_block]
|
|
269
|
-
initial_response.usage = MagicMock(input_tokens=10, output_tokens=15)
|
|
270
|
-
|
|
271
|
-
# Mock follow-up response
|
|
272
|
-
final_response = MagicMock()
|
|
273
|
-
text_block = MagicMock()
|
|
274
|
-
text_block.type = "text"
|
|
275
|
-
text_block.text = "2 + 3 = 5"
|
|
276
|
-
final_response.content = [text_block]
|
|
277
|
-
final_response.usage = MagicMock(input_tokens=20, output_tokens=10)
|
|
278
|
-
|
|
279
|
-
mock_anthropic.beta.messages.create = AsyncMock(
|
|
280
|
-
side_effect=[initial_response, final_response]
|
|
281
|
-
)
|
|
282
|
-
|
|
283
|
-
# Mock tool execution
|
|
284
|
-
agent.mcp_client.call_tool = AsyncMock(
|
|
285
|
-
return_value=types.CallToolResult(
|
|
286
|
-
content=[types.TextContent(type="text", text="5")], isError=False
|
|
287
|
-
)
|
|
288
|
-
)
|
|
289
|
-
|
|
290
|
-
# Use a string prompt instead of a task
|
|
291
|
-
result = await agent.run("What is 2 + 3?")
|
|
292
|
-
|
|
293
|
-
assert result.content == "2 + 3 = 5"
|
|
294
|
-
assert result.done is True
|
hud/task.py
DELETED
|
@@ -1,149 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import TYPE_CHECKING, Any, Literal, cast
|
|
5
|
-
|
|
6
|
-
from pydantic import BaseModel, Field
|
|
7
|
-
|
|
8
|
-
from hud.types import CustomGym, Gym, MetadataKeys, SensitiveData
|
|
9
|
-
from hud.utils.common import FunctionConfigs
|
|
10
|
-
from hud.utils.deprecation import deprecated
|
|
11
|
-
|
|
12
|
-
if TYPE_CHECKING:
|
|
13
|
-
from hud.agent import Agent
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
@deprecated(
|
|
17
|
-
reason="Task class is being replaced by TaskConfig for better MCP integration",
|
|
18
|
-
replacement="hud.datasets.TaskConfig",
|
|
19
|
-
version="0.3.0",
|
|
20
|
-
removal_version="0.4.0",
|
|
21
|
-
)
|
|
22
|
-
class Task(BaseModel):
|
|
23
|
-
"""A task that can be executed and evaluated.
|
|
24
|
-
|
|
25
|
-
A Task represents a specific activity to be performed in an environment.
|
|
26
|
-
It contains the prompt describing the task and configurations for
|
|
27
|
-
setting up and evaluating the environment.
|
|
28
|
-
|
|
29
|
-
The setup and evaluate configurations can be in several formats:
|
|
30
|
-
- String (function name): "chrome.maximize"
|
|
31
|
-
- Tuple (function with args): ("chrome.activate_tab", 5)
|
|
32
|
-
- Dict: {"function": "chrome.navigate", "args": ["https://example.com"]}
|
|
33
|
-
- List of the above: ["chrome.maximize", {"function": "chrome.navigate", "args": ["https://example.com"]}]
|
|
34
|
-
|
|
35
|
-
Attributes:
|
|
36
|
-
id: The remote task ID (optional if local-only)
|
|
37
|
-
prompt: The task prompt or instruction
|
|
38
|
-
system_prompt: The system prompt for the evalset (optional)
|
|
39
|
-
setup: Environment setup configuration (optional)
|
|
40
|
-
evaluate: Configuration for evaluating responses
|
|
41
|
-
metadata: Additional task metadata
|
|
42
|
-
sensitive_data: Sensitive data such as API keys, passwords, etc.
|
|
43
|
-
choices: Multiple choice answer list (for Inspect compatibility)
|
|
44
|
-
target: Ideal target output (for Inspect compatibility)
|
|
45
|
-
files: Files that go along with the task (for Inspect compatibility)
|
|
46
|
-
gym: Environment specification
|
|
47
|
-
"""
|
|
48
|
-
|
|
49
|
-
id: str | None = None # Remote task ID (optional if local-only)
|
|
50
|
-
|
|
51
|
-
prompt: str # Task prompt or instruction
|
|
52
|
-
system_prompt: str | None = None # System prompt for the evalset (optional)
|
|
53
|
-
|
|
54
|
-
gym: Gym | None = None # Environment specification
|
|
55
|
-
|
|
56
|
-
# Setup and evaluate configurations for the environment (environment specific)
|
|
57
|
-
setup: FunctionConfigs | None = None
|
|
58
|
-
evaluate: FunctionConfigs | None = None
|
|
59
|
-
|
|
60
|
-
# Overflow configuration for environments that don't conform to the standard
|
|
61
|
-
config: dict[str, Any] | None = None
|
|
62
|
-
|
|
63
|
-
# Sensitive data such as API keys, passwords, etc.
|
|
64
|
-
sensitive_data: SensitiveData = Field(default_factory=dict)
|
|
65
|
-
|
|
66
|
-
# Metadata for the task evaluation, information about the agent (see MetadataKeys)
|
|
67
|
-
metadata: dict[MetadataKeys, Any] = Field(default_factory=dict)
|
|
68
|
-
|
|
69
|
-
# Description of the task, for extra information about its purpose and context
|
|
70
|
-
description: str | None = None
|
|
71
|
-
|
|
72
|
-
# Gold file url for the task
|
|
73
|
-
gold_file_url: str | None = None
|
|
74
|
-
|
|
75
|
-
@classmethod
|
|
76
|
-
def from_dict(cls, data: dict[str, Any]) -> Task:
|
|
77
|
-
return cls(**data)
|
|
78
|
-
|
|
79
|
-
@classmethod
|
|
80
|
-
def from_serialized(cls, data: dict[str, Any]) -> Task:
|
|
81
|
-
gym_data = data.get("gym")
|
|
82
|
-
parsed_gym: Gym | None = gym_data
|
|
83
|
-
|
|
84
|
-
parsed_setup = [(param, entry) for param, entry in data.get("setup", [])]
|
|
85
|
-
parsed_evaluate = [(param, entry) for param, entry in data.get("evaluate", [])]
|
|
86
|
-
|
|
87
|
-
# Convert dict gym data to CustomGym if needed
|
|
88
|
-
if (
|
|
89
|
-
isinstance(gym_data, dict)
|
|
90
|
-
and gym_data.get("type") == "public"
|
|
91
|
-
and gym_data.get("location") in ("local", "remote")
|
|
92
|
-
and gym_data.get("image_or_build_context") is not None
|
|
93
|
-
):
|
|
94
|
-
parsed_gym = CustomGym(
|
|
95
|
-
type=cast("Literal['public']", gym_data["type"]),
|
|
96
|
-
location=cast("Literal['local', 'remote']", gym_data["location"]),
|
|
97
|
-
image_or_build_context=Path(gym_data["image_or_build_context"]),
|
|
98
|
-
)
|
|
99
|
-
|
|
100
|
-
return cls(
|
|
101
|
-
id=data.get("id"),
|
|
102
|
-
prompt=data.get("prompt", ""),
|
|
103
|
-
system_prompt=data.get("system_prompt"),
|
|
104
|
-
setup=parsed_setup,
|
|
105
|
-
evaluate=parsed_evaluate,
|
|
106
|
-
gym=parsed_gym,
|
|
107
|
-
config=data.get("config"),
|
|
108
|
-
description=data.get("description"),
|
|
109
|
-
sensitive_data=data.get("sensitive_data", {}),
|
|
110
|
-
metadata=data.get("metadata", {}),
|
|
111
|
-
gold_file_url=data.get("gold_file_url"),
|
|
112
|
-
)
|
|
113
|
-
|
|
114
|
-
async def fit(self, agent: Agent | type[Agent]) -> None:
|
|
115
|
-
if isinstance(agent, type):
|
|
116
|
-
agent = agent()
|
|
117
|
-
|
|
118
|
-
if self.gym is None:
|
|
119
|
-
return
|
|
120
|
-
self.gym = agent.transfer_gyms.get(self.gym, self.gym)
|
|
121
|
-
|
|
122
|
-
def serialize(self) -> dict[str, Any]:
|
|
123
|
-
if isinstance(self.setup, list):
|
|
124
|
-
parsed_setup = [[param, entry] for param, entry in self.setup]
|
|
125
|
-
else:
|
|
126
|
-
parsed_setup = self.setup
|
|
127
|
-
if isinstance(self.evaluate, list):
|
|
128
|
-
parsed_evaluate = [[param, entry] for param, entry in self.evaluate]
|
|
129
|
-
else:
|
|
130
|
-
parsed_evaluate = self.evaluate
|
|
131
|
-
|
|
132
|
-
if isinstance(self.gym, CustomGym):
|
|
133
|
-
parsed_gym = self.gym.model_dump()
|
|
134
|
-
parsed_gym["image_or_build_context"] = str(parsed_gym["image_or_build_context"])
|
|
135
|
-
else: # is ServerGym
|
|
136
|
-
parsed_gym = self.gym
|
|
137
|
-
|
|
138
|
-
return {
|
|
139
|
-
"id": self.id,
|
|
140
|
-
"prompt": self.prompt,
|
|
141
|
-
"config": self.config,
|
|
142
|
-
"description": self.description,
|
|
143
|
-
"setup": parsed_setup,
|
|
144
|
-
"evaluate": parsed_evaluate,
|
|
145
|
-
"gym": parsed_gym,
|
|
146
|
-
"sensitive_data": self.sensitive_data,
|
|
147
|
-
"metadata": self.metadata,
|
|
148
|
-
"gold_file_url": self.gold_file_url,
|
|
149
|
-
}
|
hud/taskset.py
DELETED
|
@@ -1,237 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from typing import TYPE_CHECKING, Any, get_args
|
|
6
|
-
|
|
7
|
-
from pydantic import BaseModel
|
|
8
|
-
|
|
9
|
-
from hud.env.environment import create_remote_config
|
|
10
|
-
from hud.server import make_request
|
|
11
|
-
from hud.settings import settings
|
|
12
|
-
from hud.task import Task
|
|
13
|
-
from hud.types import CustomGym, ServerGym
|
|
14
|
-
from hud.utils.config import REMOTE_EVALUATE, REMOTE_SETUP
|
|
15
|
-
from hud.utils.deprecation import deprecated
|
|
16
|
-
|
|
17
|
-
logger = logging.getLogger(__name__)
|
|
18
|
-
|
|
19
|
-
if TYPE_CHECKING:
|
|
20
|
-
from collections.abc import Iterator
|
|
21
|
-
|
|
22
|
-
from hud.agent import Agent
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
@deprecated(
|
|
26
|
-
reason="TaskSet class is being replaced by HuggingFace datasets on hud-evals",
|
|
27
|
-
replacement="Use TaskConfig-based collections or hud.datasets module",
|
|
28
|
-
version="0.3.0",
|
|
29
|
-
removal_version="0.4.0",
|
|
30
|
-
)
|
|
31
|
-
class TaskSet(BaseModel):
|
|
32
|
-
"""
|
|
33
|
-
Collection of related tasks for benchmarking.
|
|
34
|
-
|
|
35
|
-
Attributes:
|
|
36
|
-
id: Unique identifier for the taskset
|
|
37
|
-
name: Name of the taskset
|
|
38
|
-
description: Description of the taskset
|
|
39
|
-
tasks: List of Task objects in the taskset
|
|
40
|
-
"""
|
|
41
|
-
|
|
42
|
-
id: str | None = None
|
|
43
|
-
name: str | None = None
|
|
44
|
-
description: str | None = None
|
|
45
|
-
tasks: list[Task] = []
|
|
46
|
-
|
|
47
|
-
def __getitem__(self, index: int) -> Task:
|
|
48
|
-
"""
|
|
49
|
-
Allows accessing tasks by index using square bracket notation.
|
|
50
|
-
|
|
51
|
-
Args:
|
|
52
|
-
index: The index of the task to retrieve
|
|
53
|
-
|
|
54
|
-
Returns:
|
|
55
|
-
Task: The task at the specified index
|
|
56
|
-
|
|
57
|
-
Raises:
|
|
58
|
-
IndexError: If the index is out of range
|
|
59
|
-
"""
|
|
60
|
-
return self.tasks[index]
|
|
61
|
-
|
|
62
|
-
def __len__(self) -> int:
|
|
63
|
-
"""
|
|
64
|
-
Returns the number of tasks in the taskset.
|
|
65
|
-
|
|
66
|
-
Returns:
|
|
67
|
-
int: The number of tasks in the taskset
|
|
68
|
-
"""
|
|
69
|
-
return len(self.tasks)
|
|
70
|
-
|
|
71
|
-
def __iter__(self) -> Iterator[Task]:
|
|
72
|
-
"""
|
|
73
|
-
Returns an iterator over the tasks in the taskset.
|
|
74
|
-
"""
|
|
75
|
-
return iter(self.tasks)
|
|
76
|
-
|
|
77
|
-
async def upload(
|
|
78
|
-
self,
|
|
79
|
-
name: str | None = None,
|
|
80
|
-
description: str | None = None,
|
|
81
|
-
api_key: str | None = None,
|
|
82
|
-
) -> None:
|
|
83
|
-
"""
|
|
84
|
-
Uploads the taskset to the server.
|
|
85
|
-
"""
|
|
86
|
-
if name is None:
|
|
87
|
-
name = self.name
|
|
88
|
-
|
|
89
|
-
if name is None:
|
|
90
|
-
raise ValueError("Taskset name is required")
|
|
91
|
-
|
|
92
|
-
if api_key is None:
|
|
93
|
-
api_key = settings.api_key
|
|
94
|
-
|
|
95
|
-
# Convert all tasks to expanded configs
|
|
96
|
-
processed_tasks = []
|
|
97
|
-
for task in self.tasks:
|
|
98
|
-
if task.setup is not None:
|
|
99
|
-
setup_config = (
|
|
100
|
-
create_remote_config(None, task.setup, REMOTE_SETUP)[0].args[0].model_dump()
|
|
101
|
-
)
|
|
102
|
-
else:
|
|
103
|
-
setup_config = None
|
|
104
|
-
if task.evaluate is not None:
|
|
105
|
-
evaluate_config = (
|
|
106
|
-
create_remote_config(None, task.evaluate, REMOTE_EVALUATE)[0]
|
|
107
|
-
.args[0]
|
|
108
|
-
.model_dump()
|
|
109
|
-
)
|
|
110
|
-
else:
|
|
111
|
-
evaluate_config = None
|
|
112
|
-
|
|
113
|
-
if isinstance(task.gym, CustomGym):
|
|
114
|
-
if isinstance(task.gym.image_or_build_context, Path):
|
|
115
|
-
raise ValueError(
|
|
116
|
-
"Local build contexts are not supported for "
|
|
117
|
-
"remote tasksets, attach an image or existing "
|
|
118
|
-
"gym id."
|
|
119
|
-
)
|
|
120
|
-
gym_str = "docker"
|
|
121
|
-
image_uri = task.gym.image_or_build_context
|
|
122
|
-
elif isinstance(task.gym, str) and task.gym in get_args(ServerGym):
|
|
123
|
-
gym_str = task.gym
|
|
124
|
-
image_uri = None
|
|
125
|
-
else:
|
|
126
|
-
raise ValueError(f"Unknown gym type: {type(task.gym)}")
|
|
127
|
-
|
|
128
|
-
processed_tasks.append(
|
|
129
|
-
{
|
|
130
|
-
"prompt": task.prompt,
|
|
131
|
-
"gym": gym_str,
|
|
132
|
-
"setup": setup_config,
|
|
133
|
-
"evaluate": evaluate_config,
|
|
134
|
-
"config": task.config,
|
|
135
|
-
"image_uri": image_uri,
|
|
136
|
-
"description": task.description,
|
|
137
|
-
}
|
|
138
|
-
)
|
|
139
|
-
|
|
140
|
-
await make_request(
|
|
141
|
-
method="POST",
|
|
142
|
-
url=f"{settings.base_url}/v2/tasksets",
|
|
143
|
-
api_key=api_key,
|
|
144
|
-
json={
|
|
145
|
-
"name": name,
|
|
146
|
-
"description": description,
|
|
147
|
-
"tasks": processed_tasks,
|
|
148
|
-
},
|
|
149
|
-
)
|
|
150
|
-
logger.info(
|
|
151
|
-
"Taskset %s uploaded successfully, see it on app.hud.so/evalsets/%s", name, name
|
|
152
|
-
)
|
|
153
|
-
|
|
154
|
-
def _apply(self, dict: dict[str, Any]) -> None:
|
|
155
|
-
"""
|
|
156
|
-
Applies a parameter to all tasks in the taskset.
|
|
157
|
-
"""
|
|
158
|
-
for task in self.tasks:
|
|
159
|
-
for key, value in dict.items():
|
|
160
|
-
setattr(task, key, value)
|
|
161
|
-
|
|
162
|
-
def fit(self, agent: Agent | type[Agent]) -> None:
|
|
163
|
-
"""
|
|
164
|
-
Automatically adapts the taskset to the agent's transfer_gyms.
|
|
165
|
-
"""
|
|
166
|
-
if isinstance(agent, type):
|
|
167
|
-
agent = agent()
|
|
168
|
-
|
|
169
|
-
for task in self.tasks:
|
|
170
|
-
if task.gym is None or isinstance(task.gym, CustomGym):
|
|
171
|
-
continue
|
|
172
|
-
task.gym = agent.transfer_gyms.get(task.gym, task.gym)
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
@deprecated(
|
|
176
|
-
reason="load_taskset is being replaced by new dataset loading mechanisms for MCP-based tasks",
|
|
177
|
-
replacement="Use TaskConfig-based dataset loading from hud.datasets module",
|
|
178
|
-
version="0.3.0",
|
|
179
|
-
removal_version="0.4.0",
|
|
180
|
-
)
|
|
181
|
-
async def load_taskset(
|
|
182
|
-
taskset_id: str,
|
|
183
|
-
api_key: str | None = None,
|
|
184
|
-
metadata: dict[str, Any] | None = None,
|
|
185
|
-
load_custom_as_local: bool = False,
|
|
186
|
-
system_prompt: str | None = None,
|
|
187
|
-
) -> TaskSet:
|
|
188
|
-
"""
|
|
189
|
-
Loads a TaskSet by its ID.
|
|
190
|
-
|
|
191
|
-
Args:
|
|
192
|
-
taskset_id: The ID of the taskset to load
|
|
193
|
-
api_key: Optional API key to use for the request
|
|
194
|
-
metadata: Optional metadata to apply to the taskset
|
|
195
|
-
load_custom_as_local: Whether to load custom gyms as local
|
|
196
|
-
system_prompt: Optional system prompt to override the default
|
|
197
|
-
Returns:
|
|
198
|
-
TaskSet: The loaded taskset
|
|
199
|
-
"""
|
|
200
|
-
|
|
201
|
-
if api_key is None:
|
|
202
|
-
api_key = settings.api_key
|
|
203
|
-
|
|
204
|
-
data = await make_request(
|
|
205
|
-
method="GET",
|
|
206
|
-
url=f"{settings.base_url}/v2/tasksets/{taskset_id}/tasks",
|
|
207
|
-
api_key=api_key,
|
|
208
|
-
)
|
|
209
|
-
|
|
210
|
-
logger.info("Taskset %s loaded successfully", taskset_id)
|
|
211
|
-
|
|
212
|
-
tasks = data["evalset"]
|
|
213
|
-
for task in tasks:
|
|
214
|
-
if system_prompt:
|
|
215
|
-
task["system_prompt"] = system_prompt
|
|
216
|
-
if task["gym"] == "docker":
|
|
217
|
-
if "image_uri" not in task:
|
|
218
|
-
raise ValueError(
|
|
219
|
-
"No `image_uri` key found. This taskset may be "
|
|
220
|
-
"incompatible with your version of HUD SDK."
|
|
221
|
-
)
|
|
222
|
-
|
|
223
|
-
task["gym"] = CustomGym(
|
|
224
|
-
location="local" if load_custom_as_local else "remote",
|
|
225
|
-
image_or_build_context=task["image_uri"],
|
|
226
|
-
)
|
|
227
|
-
|
|
228
|
-
taskset = TaskSet.model_validate(
|
|
229
|
-
{
|
|
230
|
-
"id": taskset_id,
|
|
231
|
-
"tasks": tasks,
|
|
232
|
-
}
|
|
233
|
-
)
|
|
234
|
-
|
|
235
|
-
taskset._apply({"metadata": metadata})
|
|
236
|
-
|
|
237
|
-
return taskset
|