hud-python 0.3.4__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of hud-python might be problematic.

Files changed (192)
  1. hud/__init__.py +22 -89
  2. hud/agents/__init__.py +17 -0
  3. hud/agents/art.py +101 -0
  4. hud/agents/base.py +599 -0
  5. hud/{mcp → agents}/claude.py +373 -321
  6. hud/{mcp → agents}/langchain.py +250 -250
  7. hud/agents/misc/__init__.py +7 -0
  8. hud/{agent → agents}/misc/response_agent.py +80 -80
  9. hud/{mcp → agents}/openai.py +352 -334
  10. hud/agents/openai_chat_generic.py +154 -0
  11. hud/{mcp → agents}/tests/__init__.py +1 -1
  12. hud/agents/tests/test_base.py +742 -0
  13. hud/agents/tests/test_claude.py +324 -0
  14. hud/{mcp → agents}/tests/test_client.py +363 -324
  15. hud/{mcp → agents}/tests/test_openai.py +237 -238
  16. hud/cli/__init__.py +617 -0
  17. hud/cli/__main__.py +8 -0
  18. hud/cli/analyze.py +371 -0
  19. hud/cli/analyze_metadata.py +230 -0
  20. hud/cli/build.py +427 -0
  21. hud/cli/clone.py +185 -0
  22. hud/cli/cursor.py +92 -0
  23. hud/cli/debug.py +392 -0
  24. hud/cli/docker_utils.py +83 -0
  25. hud/cli/init.py +281 -0
  26. hud/cli/interactive.py +353 -0
  27. hud/cli/mcp_server.py +756 -0
  28. hud/cli/pull.py +336 -0
  29. hud/cli/push.py +379 -0
  30. hud/cli/remote_runner.py +311 -0
  31. hud/cli/runner.py +160 -0
  32. hud/cli/tests/__init__.py +3 -0
  33. hud/cli/tests/test_analyze.py +284 -0
  34. hud/cli/tests/test_cli_init.py +265 -0
  35. hud/cli/tests/test_cli_main.py +27 -0
  36. hud/cli/tests/test_clone.py +142 -0
  37. hud/cli/tests/test_cursor.py +253 -0
  38. hud/cli/tests/test_debug.py +453 -0
  39. hud/cli/tests/test_mcp_server.py +139 -0
  40. hud/cli/tests/test_utils.py +388 -0
  41. hud/cli/utils.py +263 -0
  42. hud/clients/README.md +143 -0
  43. hud/clients/__init__.py +16 -0
  44. hud/clients/base.py +354 -0
  45. hud/clients/fastmcp.py +202 -0
  46. hud/clients/mcp_use.py +278 -0
  47. hud/clients/tests/__init__.py +1 -0
  48. hud/clients/tests/test_client_integration.py +111 -0
  49. hud/clients/tests/test_fastmcp.py +342 -0
  50. hud/clients/tests/test_protocol.py +188 -0
  51. hud/clients/utils/__init__.py +1 -0
  52. hud/clients/utils/retry_transport.py +160 -0
  53. hud/datasets.py +322 -192
  54. hud/misc/__init__.py +1 -0
  55. hud/{agent → misc}/claude_plays_pokemon.py +292 -283
  56. hud/otel/__init__.py +35 -0
  57. hud/otel/collector.py +142 -0
  58. hud/otel/config.py +164 -0
  59. hud/otel/context.py +536 -0
  60. hud/otel/exporters.py +366 -0
  61. hud/otel/instrumentation.py +97 -0
  62. hud/otel/processors.py +118 -0
  63. hud/otel/tests/__init__.py +1 -0
  64. hud/otel/tests/test_processors.py +197 -0
  65. hud/server/__init__.py +5 -5
  66. hud/server/context.py +114 -0
  67. hud/server/helper/__init__.py +5 -0
  68. hud/server/low_level.py +132 -0
  69. hud/server/server.py +166 -0
  70. hud/server/tests/__init__.py +3 -0
  71. hud/settings.py +73 -79
  72. hud/shared/__init__.py +5 -0
  73. hud/{exceptions.py → shared/exceptions.py} +180 -180
  74. hud/{server → shared}/requests.py +264 -264
  75. hud/shared/tests/test_exceptions.py +157 -0
  76. hud/{server → shared}/tests/test_requests.py +275 -275
  77. hud/telemetry/__init__.py +25 -30
  78. hud/telemetry/instrument.py +379 -0
  79. hud/telemetry/job.py +309 -141
  80. hud/telemetry/replay.py +74 -0
  81. hud/telemetry/trace.py +83 -0
  82. hud/tools/__init__.py +33 -34
  83. hud/tools/base.py +365 -65
  84. hud/tools/bash.py +161 -137
  85. hud/tools/computer/__init__.py +15 -13
  86. hud/tools/computer/anthropic.py +437 -414
  87. hud/tools/computer/hud.py +376 -328
  88. hud/tools/computer/openai.py +295 -286
  89. hud/tools/computer/settings.py +82 -0
  90. hud/tools/edit.py +314 -290
  91. hud/tools/executors/__init__.py +30 -30
  92. hud/tools/executors/base.py +539 -532
  93. hud/tools/executors/pyautogui.py +621 -619
  94. hud/tools/executors/tests/__init__.py +1 -1
  95. hud/tools/executors/tests/test_base_executor.py +338 -338
  96. hud/tools/executors/tests/test_pyautogui_executor.py +165 -165
  97. hud/tools/executors/xdo.py +511 -503
  98. hud/tools/{playwright_tool.py → playwright.py} +412 -379
  99. hud/tools/tests/__init__.py +3 -3
  100. hud/tools/tests/test_base.py +282 -0
  101. hud/tools/tests/test_bash.py +158 -152
  102. hud/tools/tests/test_bash_extended.py +197 -0
  103. hud/tools/tests/test_computer.py +425 -52
  104. hud/tools/tests/test_computer_actions.py +34 -34
  105. hud/tools/tests/test_edit.py +259 -240
  106. hud/tools/tests/test_init.py +27 -27
  107. hud/tools/tests/test_playwright_tool.py +183 -183
  108. hud/tools/tests/test_tools.py +145 -157
  109. hud/tools/tests/test_utils.py +156 -156
  110. hud/tools/types.py +72 -0
  111. hud/tools/utils.py +50 -50
  112. hud/types.py +136 -89
  113. hud/utils/__init__.py +10 -16
  114. hud/utils/async_utils.py +65 -0
  115. hud/utils/design.py +168 -0
  116. hud/utils/mcp.py +55 -0
  117. hud/utils/progress.py +149 -149
  118. hud/utils/telemetry.py +66 -66
  119. hud/utils/tests/test_async_utils.py +173 -0
  120. hud/utils/tests/test_init.py +17 -21
  121. hud/utils/tests/test_progress.py +261 -225
  122. hud/utils/tests/test_telemetry.py +82 -37
  123. hud/utils/tests/test_version.py +8 -8
  124. hud/version.py +7 -7
  125. hud_python-0.4.0.dist-info/METADATA +474 -0
  126. hud_python-0.4.0.dist-info/RECORD +132 -0
  127. hud_python-0.4.0.dist-info/entry_points.txt +3 -0
  128. {hud_python-0.3.4.dist-info → hud_python-0.4.0.dist-info}/licenses/LICENSE +21 -21
  129. hud/adapters/__init__.py +0 -8
  130. hud/adapters/claude/__init__.py +0 -5
  131. hud/adapters/claude/adapter.py +0 -180
  132. hud/adapters/claude/tests/__init__.py +0 -1
  133. hud/adapters/claude/tests/test_adapter.py +0 -519
  134. hud/adapters/common/__init__.py +0 -6
  135. hud/adapters/common/adapter.py +0 -178
  136. hud/adapters/common/tests/test_adapter.py +0 -289
  137. hud/adapters/common/types.py +0 -446
  138. hud/adapters/operator/__init__.py +0 -5
  139. hud/adapters/operator/adapter.py +0 -108
  140. hud/adapters/operator/tests/__init__.py +0 -1
  141. hud/adapters/operator/tests/test_adapter.py +0 -370
  142. hud/agent/__init__.py +0 -19
  143. hud/agent/base.py +0 -126
  144. hud/agent/claude.py +0 -271
  145. hud/agent/langchain.py +0 -215
  146. hud/agent/misc/__init__.py +0 -3
  147. hud/agent/operator.py +0 -268
  148. hud/agent/tests/__init__.py +0 -1
  149. hud/agent/tests/test_base.py +0 -202
  150. hud/env/__init__.py +0 -11
  151. hud/env/client.py +0 -35
  152. hud/env/docker_client.py +0 -349
  153. hud/env/environment.py +0 -446
  154. hud/env/local_docker_client.py +0 -358
  155. hud/env/remote_client.py +0 -212
  156. hud/env/remote_docker_client.py +0 -292
  157. hud/gym.py +0 -130
  158. hud/job.py +0 -773
  159. hud/mcp/__init__.py +0 -17
  160. hud/mcp/base.py +0 -631
  161. hud/mcp/client.py +0 -312
  162. hud/mcp/tests/test_base.py +0 -512
  163. hud/mcp/tests/test_claude.py +0 -294
  164. hud/task.py +0 -149
  165. hud/taskset.py +0 -237
  166. hud/telemetry/_trace.py +0 -347
  167. hud/telemetry/context.py +0 -230
  168. hud/telemetry/exporter.py +0 -575
  169. hud/telemetry/instrumentation/__init__.py +0 -3
  170. hud/telemetry/instrumentation/mcp.py +0 -259
  171. hud/telemetry/instrumentation/registry.py +0 -59
  172. hud/telemetry/mcp_models.py +0 -270
  173. hud/telemetry/tests/__init__.py +0 -1
  174. hud/telemetry/tests/test_context.py +0 -210
  175. hud/telemetry/tests/test_trace.py +0 -312
  176. hud/tools/helper/README.md +0 -56
  177. hud/tools/helper/__init__.py +0 -9
  178. hud/tools/helper/mcp_server.py +0 -78
  179. hud/tools/helper/server_initialization.py +0 -115
  180. hud/tools/helper/utils.py +0 -58
  181. hud/trajectory.py +0 -94
  182. hud/utils/agent.py +0 -37
  183. hud/utils/common.py +0 -256
  184. hud/utils/config.py +0 -120
  185. hud/utils/deprecation.py +0 -115
  186. hud/utils/misc.py +0 -53
  187. hud/utils/tests/test_common.py +0 -277
  188. hud/utils/tests/test_config.py +0 -129
  189. hud_python-0.3.4.dist-info/METADATA +0 -284
  190. hud_python-0.3.4.dist-info/RECORD +0 -120
  191. /hud/{adapters/common → shared}/tests/__init__.py +0 -0
  192. {hud_python-0.3.4.dist-info → hud_python-0.4.0.dist-info}/WHEEL +0 -0
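
The file list above shows the 0.4.0 restructuring: the hud/agent and hud/mcp packages are consolidated into hud/agents, hud/exceptions and hud/server/requests move under hud/shared, and hud/tools/playwright_tool.py becomes hud/tools/playwright.py. As a rough, non-authoritative sketch of coping with these moves (module paths are taken from the rename entries above; no symbol names are assumed, since the listing does not show them), code that must run against either release could resolve the modules dynamically:

    import importlib
    from types import ModuleType

    def import_first(*candidates: str) -> ModuleType:
        """Return the first importable module from a list of dotted paths."""
        for path in candidates:
            try:
                return importlib.import_module(path)
            except ModuleNotFoundError:
                continue
        raise ModuleNotFoundError(f"none of {candidates} could be imported")

    # Module paths taken from the rename list above (0.4.0 location first, then 0.3.4).
    claude_agent_mod = import_first("hud.agents.claude", "hud.mcp.claude")
    exceptions_mod = import_first("hud.shared.exceptions", "hud.exceptions")
    playwright_mod = import_first("hud.tools.playwright", "hud.tools.playwright_tool")

In practice, pinning hud-python to one side of the 0.4.0 boundary is simpler; the shim only illustrates which module paths changed.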
hud/mcp/tests/test_claude.py DELETED
@@ -1,294 +0,0 @@
- """Tests for Claude MCP Agent implementation."""
-
- from __future__ import annotations
-
- from typing import TYPE_CHECKING, cast
- from unittest.mock import AsyncMock, MagicMock, patch
-
- import pytest
- from anthropic import BadRequestError
- from mcp import types
- from mcp.types import CallToolRequestParams as MCPToolCall
-
- from hud.mcp.claude import (
-     ClaudeMCPAgent,
-     base64_to_content_block,
-     text_to_content_block,
-     tool_use_content_block,
- )
-
- if TYPE_CHECKING:
-     from anthropic.types.beta import BetaImageBlockParam, BetaMessageParam, BetaTextBlockParam
-
-
- class TestClaudeHelperFunctions:
-     """Test helper functions for Claude message formatting."""
-
-     def test_base64_to_content_block(self):
-         """Test base64 image conversion."""
-         base64_data = "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==" # noqa: E501
-         result = base64_to_content_block(base64_data)
-
-         assert result["type"] == "image"
-         assert result["source"]["type"] == "base64"
-         assert result["source"]["media_type"] == "image/png"
-         assert result["source"]["data"] == base64_data
-
-     def test_text_to_content_block(self):
-         """Test text conversion."""
-         text = "Hello, world!"
-         result = text_to_content_block(text)
-
-         assert result["type"] == "text"
-         assert result["text"] == text
-
-     def test_tool_use_content_block(self):
-         """Test tool result content block creation."""
-         tool_use_id = "tool_123"
-         content: list[BetaTextBlockParam | BetaImageBlockParam] = [
-             text_to_content_block("Result text")
-         ]
-
-         result = tool_use_content_block(tool_use_id, content)
-
-         assert result["type"] == "tool_result"
-         assert result["tool_use_id"] == tool_use_id
-         assert result["content"] == content # type: ignore
-
-
- class TestClaudeMCPAgent:
-     """Test ClaudeMCPAgent class."""
-
-     @pytest.fixture
-     def mock_mcp_client(self):
-         """Create a mock MCP client."""
-         mcp_client = MagicMock()
-         mcp_client.get_all_active_sessions = MagicMock(return_value={})
-         mcp_client.get_tool_map = MagicMock(return_value={})
-         return mcp_client
-
-     @pytest.fixture
-     def mock_anthropic(self):
-         """Create a mock Anthropic client."""
-         with patch("hud.mcp.claude.AsyncAnthropic") as mock:
-             client = AsyncMock()
-             # Add beta attribute with messages
-             client.beta = AsyncMock()
-             client.beta.messages = AsyncMock()
-             mock.return_value = client
-             yield client
-
-     @pytest.mark.asyncio
-     async def test_init(self, mock_mcp_client, mock_anthropic):
-         """Test agent initialization."""
-         # Test with provided model_client
-         mock_model_client = MagicMock()
-         agent = ClaudeMCPAgent(
-             mcp_client=mock_mcp_client,
-             model_client=mock_model_client,
-             model="claude-3-opus-20240229",
-             max_tokens=1000,
-         )
-
-         assert agent.model_name == "claude-3-opus-20240229"
-         assert agent.max_tokens == 1000
-         assert agent.anthropic_client == mock_model_client
-
-     @pytest.mark.asyncio
-     async def test_init_without_model_client(self, mock_mcp_client):
-         """Test agent initialization without model client."""
-         with patch("hud.mcp.claude.settings.anthropic_api_key", "test_key"):
-             agent = ClaudeMCPAgent(mcp_client=mock_mcp_client, model="claude-3-opus-20240229")
-
-         assert agent.model_name == "claude-3-opus-20240229"
-         assert agent.anthropic_client is not None
-
-     @pytest.mark.asyncio
-     async def test_create_initial_messages(self, mock_mcp_client):
-         """Test creating initial messages."""
-         mock_model_client = MagicMock()
-         agent = ClaudeMCPAgent(mcp_client=mock_mcp_client, model_client=mock_model_client)
-
-         # Test with text only
-         messages = await agent.create_initial_messages("Hello, Claude!")
-         assert len(messages) == 1
-         assert messages[0]["role"] == "user"
-         content = list(messages[0]["content"])
-         assert content[0]["type"] == "text" # type: ignore
-         assert content[0]["text"] == "Hello, Claude!" # type: ignore
-
-         # Test with screenshot
-         messages = await agent.create_initial_messages("Look at this", screenshot="base64data")
-         assert len(messages) == 1
-         assert messages[0]["role"] == "user"
-         content = list(messages[0]["content"])
-         assert len(content) == 2
-         # Claude puts text first, then image
-         assert content[0]["type"] == "text" # type: ignore
-         assert content[0]["text"] == "Look at this" # type: ignore
-         assert content[1]["type"] == "image" # type: ignore
-
-     @pytest.mark.asyncio
-     async def test_format_tool_results_method(self, mock_mcp_client):
-         """Test the agent's format_tool_results method."""
-         mock_model_client = MagicMock()
-         agent = ClaudeMCPAgent(mcp_client=mock_mcp_client, model_client=mock_model_client)
-
-         tool_calls = [
-             MCPToolCall(name="test_tool", arguments={}, tool_use_id="id1"), # type: ignore
-         ]
-
-         tool_results = [
-             types.CallToolResult(
-                 content=[types.TextContent(type="text", text="Success")], isError=False
-             ),
-         ]
-
-         messages = await agent.format_tool_results(tool_calls, tool_results)
-
-         # format_tool_results returns a single user message with tool result content
-         assert len(messages) == 1
-         assert messages[0]["role"] == "user"
-         # The content is wrapped in a tool result block
-         content = list(messages[0]["content"])
-         assert len(content) == 1
-         assert content[0]["type"] == "tool_result" # type: ignore
-         assert content[0]["tool_use_id"] == "id1" # type: ignore
-         # The actual content is nested inside
-         inner_content = list(content[0]["content"]) # type: ignore
-         assert inner_content[0]["type"] == "text" # type: ignore
-         assert inner_content[0]["text"] == "Success" # type: ignore
-
-     @pytest.mark.asyncio
-     async def test_get_model_response(self, mock_mcp_client, mock_anthropic):
-         """Test getting model response from Claude API."""
-         agent = ClaudeMCPAgent(mcp_client=mock_mcp_client, model_client=mock_anthropic)
-
-         # Mock the API response
-         mock_response = MagicMock()
-
-         # Create text block
-         text_block = MagicMock()
-         text_block.type = "text"
-         text_block.text = "Hello!"
-
-         # Create tool use block
-         tool_block = MagicMock()
-         tool_block.type = "tool_use"
-         tool_block.id = "tool_123"
-         tool_block.name = "test_tool"
-         tool_block.input = {"param": "value"}
-
-         mock_response.content = [text_block, tool_block]
-         mock_response.usage = MagicMock(input_tokens=10, output_tokens=20)
-         mock_anthropic.beta.messages.create = AsyncMock(return_value=mock_response)
-
-         messages = [
-             cast("BetaMessageParam", {"role": "user", "content": [{"type": "text", "text": "Hi"}]})
-         ]
-         response = await agent.get_model_response(messages)
-
-         assert response.content == "Hello!"
-         assert len(response.tool_calls) == 1
-         assert response.tool_calls[0].name == "test_tool"
-         assert response.tool_calls[0].arguments == {"param": "value"}
-         # The test was checking for Claude-specific attributes that aren't part of ModelResponse
-         # These would need to be accessed from the original Claude response if needed
-
-         # Verify API was called correctly
-         mock_anthropic.beta.messages.create.assert_called_once()
-
-     @pytest.mark.asyncio
-     async def test_get_model_response_text_only(self, mock_mcp_client, mock_anthropic):
-         """Test getting text-only response."""
-         agent = ClaudeMCPAgent(mcp_client=mock_mcp_client, model_client=mock_anthropic)
-
-         mock_response = MagicMock()
-         # Create text block
-         text_block = MagicMock()
-         text_block.type = "text"
-         text_block.text = "Just text"
-         mock_response.content = [text_block]
-         mock_response.usage = MagicMock(input_tokens=5, output_tokens=10)
-         mock_anthropic.beta.messages.create = AsyncMock(return_value=mock_response)
-
-         messages = [
-             cast("BetaMessageParam", {"role": "user", "content": [{"type": "text", "text": "Hi"}]})
-         ]
-         response = await agent.get_model_response(messages)
-
-         assert response.content == "Just text"
-         assert response.tool_calls == []
-
-     @pytest.mark.asyncio
-     async def test_get_model_response_error(self, mock_mcp_client, mock_anthropic):
-         """Test handling API errors."""
-         agent = ClaudeMCPAgent(mcp_client=mock_mcp_client, model_client=mock_anthropic)
-
-         # Mock API error
-         mock_anthropic.beta.messages.create = AsyncMock(
-             side_effect=BadRequestError(
-                 message="Invalid request",
-                 response=MagicMock(status_code=400),
-                 body={"error": {"message": "Invalid request"}},
-             )
-         )
-
-         messages = [{"role": "user", "content": [{"type": "text", "text": "Hi"}]}]
-
-         with pytest.raises(BadRequestError):
-             await agent.get_model_response(messages) # type: ignore
-
-     @pytest.mark.asyncio
-     async def test_run_with_tools(self, mock_mcp_client, mock_anthropic):
-         """Test running agent with tool usage."""
-         agent = ClaudeMCPAgent(mcp_client=mock_mcp_client, model_client=mock_anthropic)
-
-         # Mock tool availability
-         agent._available_tools = [
-             types.Tool(name="calculator", description="Calculator", inputSchema={"type": "object"})
-         ]
-         agent._tool_map = {
-             "calculator": (
-                 "server1",
-                 types.Tool(
-                     name="calculator", description="Calculator", inputSchema={"type": "object"}
-                 ),
-             )
-         }
-
-         # Mock initial response with tool use
-         initial_response = MagicMock()
-         # Create tool use block
-         tool_block = MagicMock()
-         tool_block.type = "tool_use"
-         tool_block.id = "calc_123"
-         tool_block.name = "calculator"
-         tool_block.input = {"operation": "add", "a": 2, "b": 3}
-         initial_response.content = [tool_block]
-         initial_response.usage = MagicMock(input_tokens=10, output_tokens=15)
-
-         # Mock follow-up response
-         final_response = MagicMock()
-         text_block = MagicMock()
-         text_block.type = "text"
-         text_block.text = "2 + 3 = 5"
-         final_response.content = [text_block]
-         final_response.usage = MagicMock(input_tokens=20, output_tokens=10)
-
-         mock_anthropic.beta.messages.create = AsyncMock(
-             side_effect=[initial_response, final_response]
-         )
-
-         # Mock tool execution
-         agent.mcp_client.call_tool = AsyncMock(
-             return_value=types.CallToolResult(
-                 content=[types.TextContent(type="text", text="5")], isError=False
-             )
-         )
-
-         # Use a string prompt instead of a task
-         result = await agent.run("What is 2 + 3?")
-
-         assert result.content == "2 + 3 = 5"
-         assert result.done is True
hud/task.py DELETED
@@ -1,149 +0,0 @@
- from __future__ import annotations
-
- from pathlib import Path
- from typing import TYPE_CHECKING, Any, Literal, cast
-
- from pydantic import BaseModel, Field
-
- from hud.types import CustomGym, Gym, MetadataKeys, SensitiveData
- from hud.utils.common import FunctionConfigs
- from hud.utils.deprecation import deprecated
-
- if TYPE_CHECKING:
-     from hud.agent import Agent
-
-
- @deprecated(
-     reason="Task class is being replaced by TaskConfig for better MCP integration",
-     replacement="hud.datasets.TaskConfig",
-     version="0.3.0",
-     removal_version="0.4.0",
- )
- class Task(BaseModel):
-     """A task that can be executed and evaluated.
-
-     A Task represents a specific activity to be performed in an environment.
-     It contains the prompt describing the task and configurations for
-     setting up and evaluating the environment.
-
-     The setup and evaluate configurations can be in several formats:
-     String (function name): "chrome.maximize"
-     Tuple (function with args): ("chrome.activate_tab", 5)
-     Dict: {"function": "chrome.navigate", "args": ["https://example.com"]}
-     List of the above: ["chrome.maximize", {"function": "chrome.navigate", "args": ["https://example.com"]}]
-
-     Attributes:
-         id: The remote task ID (optional if local-only)
-         prompt: The task prompt or instruction
-         system_prompt: The system prompt for the evalset (optional)
-         setup: Environment setup configuration (optional)
-         evaluate: Configuration for evaluating responses
-         metadata: Additional task metadata
-         sensitive_data: Sensitive data such as API keys, passwords, etc.
-         choices: Multiple choice answer list (for Inspect compatibility)
-         target: Ideal target output (for Inspect compatibility)
-         files: Files that go along with the task (for Inspect compatibility)
-         gym: Environment specification
-     """
-
-     id: str | None = None # Remote task ID (optional if local-only)
-
-     prompt: str # Task prompt or instruction
-     system_prompt: str | None = None # System prompt for the evalset (optional)
-
-     gym: Gym | None = None # Environment specification
-
-     # Setup and evaluate configurations for the environment (environment specific)
-     setup: FunctionConfigs | None = None
-     evaluate: FunctionConfigs | None = None
-
-     # Overflow configuration for environments that don't conform to the standard
-     config: dict[str, Any] | None = None
-
-     # Sensitive data such as API keys, passwords, etc.
-     sensitive_data: SensitiveData = Field(default_factory=dict)
-
-     # Metadata for the task evaluation, information about the agent (see MetadataKeys)
-     metadata: dict[MetadataKeys, Any] = Field(default_factory=dict)
-
-     # Description of the task, for extra information about its purpose and context
-     description: str | None = None
-
-     # Gold file url for the task
-     gold_file_url: str | None = None
-
-     @classmethod
-     def from_dict(cls, data: dict[str, Any]) -> Task:
-         return cls(**data)
-
-     @classmethod
-     def from_serialized(cls, data: dict[str, Any]) -> Task:
-         gym_data = data.get("gym")
-         parsed_gym: Gym | None = gym_data
-
-         parsed_setup = [(param, entry) for param, entry in data.get("setup", [])]
-         parsed_evaluate = [(param, entry) for param, entry in data.get("evaluate", [])]
-
-         # Convert dict gym data to CustomGym if needed
-         if (
-             isinstance(gym_data, dict)
-             and gym_data.get("type") == "public"
-             and gym_data.get("location") in ("local", "remote")
-             and gym_data.get("image_or_build_context") is not None
-         ):
-             parsed_gym = CustomGym(
-                 type=cast("Literal['public']", gym_data["type"]),
-                 location=cast("Literal['local', 'remote']", gym_data["location"]),
-                 image_or_build_context=Path(gym_data["image_or_build_context"]),
-             )
-
-         return cls(
-             id=data.get("id"),
-             prompt=data.get("prompt", ""),
-             system_prompt=data.get("system_prompt"),
-             setup=parsed_setup,
-             evaluate=parsed_evaluate,
-             gym=parsed_gym,
-             config=data.get("config"),
-             description=data.get("description"),
-             sensitive_data=data.get("sensitive_data", {}),
-             metadata=data.get("metadata", {}),
-             gold_file_url=data.get("gold_file_url"),
-         )
-
-     async def fit(self, agent: Agent | type[Agent]) -> None:
-         if isinstance(agent, type):
-             agent = agent()
-
-         if self.gym is None:
-             return
-         self.gym = agent.transfer_gyms.get(self.gym, self.gym)
-
-     def serialize(self) -> dict[str, Any]:
-         if isinstance(self.setup, list):
-             parsed_setup = [[param, entry] for param, entry in self.setup]
-         else:
-             parsed_setup = self.setup
-         if isinstance(self.evaluate, list):
-             parsed_evaluate = [[param, entry] for param, entry in self.evaluate]
-         else:
-             parsed_evaluate = self.evaluate
-
-         if isinstance(self.gym, CustomGym):
-             parsed_gym = self.gym.model_dump()
-             parsed_gym["image_or_build_context"] = str(parsed_gym["image_or_build_context"])
-         else: # is ServerGym
-             parsed_gym = self.gym
-
-         return {
-             "id": self.id,
-             "prompt": self.prompt,
-             "config": self.config,
-             "description": self.description,
-             "setup": parsed_setup,
-             "evaluate": parsed_evaluate,
-             "gym": parsed_gym,
-             "sensitive_data": self.sensitive_data,
-             "metadata": self.metadata,
-             "gold_file_url": self.gold_file_url,
-         }
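
The Task docstring above lists the accepted setup/evaluate formats (string, tuple, dict, or a list of these). Below is a minimal, illustrative sketch of the deprecated 0.3.x usage removed in 0.4.0; the build-context path and the evaluator function name are placeholders, not taken from this diff, and only the formats named in the docstring are exercised.

    # Illustrative only: building a Task with the deprecated 0.3.x API deleted above.
    from pathlib import Path

    from hud.task import Task
    from hud.types import CustomGym

    task = Task(
        prompt="Open https://example.com and maximize the window",
        gym=CustomGym(location="local", image_or_build_context=Path("./browser_env")),  # placeholder path
        setup=[
            "chrome.maximize",  # string form: bare function name
            {"function": "chrome.navigate", "args": ["https://example.com"]},  # dict form
        ],
        evaluate=("response_includes", "Example Domain"),  # tuple form; evaluator name is a placeholder
    )
    print(task.prompt)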
hud/taskset.py DELETED
@@ -1,237 +0,0 @@
- from __future__ import annotations
-
- import logging
- from pathlib import Path
- from typing import TYPE_CHECKING, Any, get_args
-
- from pydantic import BaseModel
-
- from hud.env.environment import create_remote_config
- from hud.server import make_request
- from hud.settings import settings
- from hud.task import Task
- from hud.types import CustomGym, ServerGym
- from hud.utils.config import REMOTE_EVALUATE, REMOTE_SETUP
- from hud.utils.deprecation import deprecated
-
- logger = logging.getLogger(__name__)
-
- if TYPE_CHECKING:
-     from collections.abc import Iterator
-
-     from hud.agent import Agent
-
-
- @deprecated(
-     reason="TaskSet class is being replaced by HuggingFace datasets on hud-evals",
-     replacement="Use TaskConfig-based collections or hud.datasets module",
-     version="0.3.0",
-     removal_version="0.4.0",
- )
- class TaskSet(BaseModel):
-     """
-     Collection of related tasks for benchmarking.
-
-     Attributes:
-         id: Unique identifier for the taskset
-         name: Name of the taskset
-         description: Description of the taskset
-         tasks: List of Task objects in the taskset
-     """
-
-     id: str | None = None
-     name: str | None = None
-     description: str | None = None
-     tasks: list[Task] = []
-
-     def __getitem__(self, index: int) -> Task:
-         """
-         Allows accessing tasks by index using square bracket notation.
-
-         Args:
-             index: The index of the task to retrieve
-
-         Returns:
-             Task: The task at the specified index
-
-         Raises:
-             IndexError: If the index is out of range
-         """
-         return self.tasks[index]
-
-     def __len__(self) -> int:
-         """
-         Returns the number of tasks in the taskset.
-
-         Returns:
-             int: The number of tasks in the taskset
-         """
-         return len(self.tasks)
-
-     def __iter__(self) -> Iterator[Task]:
-         """
-         Returns an iterator over the tasks in the taskset.
-         """
-         return iter(self.tasks)
-
-     async def upload(
-         self,
-         name: str | None = None,
-         description: str | None = None,
-         api_key: str | None = None,
-     ) -> None:
-         """
-         Uploads the taskset to the server.
-         """
-         if name is None:
-             name = self.name
-
-         if name is None:
-             raise ValueError("Taskset name is required")
-
-         if api_key is None:
-             api_key = settings.api_key
-
-         # Convert all tasks to expanded configs
-         processed_tasks = []
-         for task in self.tasks:
-             if task.setup is not None:
-                 setup_config = (
-                     create_remote_config(None, task.setup, REMOTE_SETUP)[0].args[0].model_dump()
-                 )
-             else:
-                 setup_config = None
-             if task.evaluate is not None:
-                 evaluate_config = (
-                     create_remote_config(None, task.evaluate, REMOTE_EVALUATE)[0]
-                     .args[0]
-                     .model_dump()
-                 )
-             else:
-                 evaluate_config = None
-
-             if isinstance(task.gym, CustomGym):
-                 if isinstance(task.gym.image_or_build_context, Path):
-                     raise ValueError(
-                         "Local build contexts are not supported for "
-                         "remote tasksets, attach an image or existing "
-                         "gym id."
-                     )
-                 gym_str = "docker"
-                 image_uri = task.gym.image_or_build_context
-             elif isinstance(task.gym, str) and task.gym in get_args(ServerGym):
-                 gym_str = task.gym
-                 image_uri = None
-             else:
-                 raise ValueError(f"Unknown gym type: {type(task.gym)}")
-
-             processed_tasks.append(
-                 {
-                     "prompt": task.prompt,
-                     "gym": gym_str,
-                     "setup": setup_config,
-                     "evaluate": evaluate_config,
-                     "config": task.config,
-                     "image_uri": image_uri,
-                     "description": task.description,
-                 }
-             )
-
-         await make_request(
-             method="POST",
-             url=f"{settings.base_url}/v2/tasksets",
-             api_key=api_key,
-             json={
-                 "name": name,
-                 "description": description,
-                 "tasks": processed_tasks,
-             },
-         )
-         logger.info(
-             "Taskset %s uploaded successfully, see it on app.hud.so/evalsets/%s", name, name
-         )
-
-     def _apply(self, dict: dict[str, Any]) -> None:
-         """
-         Applies a parameter to all tasks in the taskset.
-         """
-         for task in self.tasks:
-             for key, value in dict.items():
-                 setattr(task, key, value)
-
-     def fit(self, agent: Agent | type[Agent]) -> None:
-         """
-         Automatically adapts the taskset to the agent's transfer_gyms.
-         """
-         if isinstance(agent, type):
-             agent = agent()
-
-         for task in self.tasks:
-             if task.gym is None or isinstance(task.gym, CustomGym):
-                 continue
-             task.gym = agent.transfer_gyms.get(task.gym, task.gym)
-
-
- @deprecated(
-     reason="load_taskset is being replaced by new dataset loading mechanisms for MCP-based tasks",
-     replacement="Use TaskConfig-based dataset loading from hud.datasets module",
-     version="0.3.0",
-     removal_version="0.4.0",
- )
- async def load_taskset(
-     taskset_id: str,
-     api_key: str | None = None,
-     metadata: dict[str, Any] | None = None,
-     load_custom_as_local: bool = False,
-     system_prompt: str | None = None,
- ) -> TaskSet:
-     """
-     Loads a TaskSet by its ID.
-
-     Args:
-         taskset_id: The ID of the taskset to load
-         api_key: Optional API key to use for the request
-         metadata: Optional metadata to apply to the taskset
-         load_custom_as_local: Whether to load custom gyms as local
-         system_prompt: Optional system prompt to override the default
-     Returns:
-         TaskSet: The loaded taskset
-     """
-
-     if api_key is None:
-         api_key = settings.api_key
-
-     data = await make_request(
-         method="GET",
-         url=f"{settings.base_url}/v2/tasksets/{taskset_id}/tasks",
-         api_key=api_key,
-     )
-
-     logger.info("Taskset %s loaded successfully", taskset_id)
-
-     tasks = data["evalset"]
-     for task in tasks:
-         if system_prompt:
-             task["system_prompt"] = system_prompt
-         if task["gym"] == "docker":
-             if "image_uri" not in task:
-                 raise ValueError(
-                     "No `image_uri` key found. This taskset may be "
-                     "incompatible with your version of HUD SDK."
-                 )
-
-             task["gym"] = CustomGym(
-                 location="local" if load_custom_as_local else "remote",
-                 image_or_build_context=task["image_uri"],
-             )
-
-     taskset = TaskSet.model_validate(
-         {
-             "id": taskset_id,
-             "tasks": tasks,
-         }
-     )
-
-     taskset._apply({"metadata": metadata})
-
-     return taskset
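
For reference, a minimal sketch of the deprecated 0.3.x loading flow that this deletion removes, based only on the load_taskset signature and the TaskSet container methods shown above; the taskset ID is a placeholder, and a valid API key is assumed to be configured in settings.

    # Illustrative only: the 0.3.x taskset loading pattern removed in 0.4.0.
    import asyncio

    from hud.taskset import load_taskset

    async def main() -> None:
        taskset = await load_taskset(
            "example-taskset-id",       # placeholder ID, not from this diff
            load_custom_as_local=True,  # docker-backed gyms become local CustomGym entries
        )
        print(len(taskset))             # TaskSet defines __len__
        for task in taskset:            # ...and __iter__
            print(task.prompt)

    asyncio.run(main())

In 0.4.0 this path is gone; the deprecation decorators above point to the hud.datasets module as the replacement.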